TrajectoryCrafter commited on
Commit
0f56e8b
·
1 Parent(s): d794a86
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LICENSE +201 -0
  2. app.py +284 -0
  3. demo.py +377 -0
  4. docs/config_help.md +27 -0
  5. extern/depthcrafter/__init__.py +0 -0
  6. extern/depthcrafter/__pycache__/__init__.cpython-310.pyc +0 -0
  7. extern/depthcrafter/__pycache__/demo.cpython-310.pyc +0 -0
  8. extern/depthcrafter/__pycache__/depth_crafter_ppl.cpython-310.pyc +0 -0
  9. extern/depthcrafter/__pycache__/infer.cpython-310.pyc +0 -0
  10. extern/depthcrafter/__pycache__/unet.cpython-310.pyc +0 -0
  11. extern/depthcrafter/__pycache__/utils.cpython-310.pyc +0 -0
  12. extern/depthcrafter/depth_crafter_ppl.py +366 -0
  13. extern/depthcrafter/infer.py +91 -0
  14. extern/depthcrafter/unet.py +142 -0
  15. extern/video_depth_anything/__pycache__/dinov2.cpython-310.pyc +0 -0
  16. extern/video_depth_anything/__pycache__/dpt.cpython-310.pyc +0 -0
  17. extern/video_depth_anything/__pycache__/dpt_temporal.cpython-310.pyc +0 -0
  18. extern/video_depth_anything/__pycache__/vdademo.cpython-310.pyc +0 -0
  19. extern/video_depth_anything/__pycache__/video_depth.cpython-310.pyc +0 -0
  20. extern/video_depth_anything/dinov2.py +415 -0
  21. extern/video_depth_anything/dinov2_layers/__init__.py +11 -0
  22. extern/video_depth_anything/dinov2_layers/__pycache__/__init__.cpython-310.pyc +0 -0
  23. extern/video_depth_anything/dinov2_layers/__pycache__/attention.cpython-310.pyc +0 -0
  24. extern/video_depth_anything/dinov2_layers/__pycache__/block.cpython-310.pyc +0 -0
  25. extern/video_depth_anything/dinov2_layers/__pycache__/drop_path.cpython-310.pyc +0 -0
  26. extern/video_depth_anything/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc +0 -0
  27. extern/video_depth_anything/dinov2_layers/__pycache__/mlp.cpython-310.pyc +0 -0
  28. extern/video_depth_anything/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc +0 -0
  29. extern/video_depth_anything/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc +0 -0
  30. extern/video_depth_anything/dinov2_layers/attention.py +83 -0
  31. extern/video_depth_anything/dinov2_layers/block.py +252 -0
  32. extern/video_depth_anything/dinov2_layers/drop_path.py +35 -0
  33. extern/video_depth_anything/dinov2_layers/layer_scale.py +28 -0
  34. extern/video_depth_anything/dinov2_layers/mlp.py +41 -0
  35. extern/video_depth_anything/dinov2_layers/patch_embed.py +89 -0
  36. extern/video_depth_anything/dinov2_layers/swiglu_ffn.py +63 -0
  37. extern/video_depth_anything/dpt.py +160 -0
  38. extern/video_depth_anything/dpt_temporal.py +96 -0
  39. extern/video_depth_anything/motion_module/__pycache__/attention.cpython-310.pyc +0 -0
  40. extern/video_depth_anything/motion_module/__pycache__/motion_module.cpython-310.pyc +0 -0
  41. extern/video_depth_anything/motion_module/attention.py +429 -0
  42. extern/video_depth_anything/motion_module/motion_module.py +297 -0
  43. extern/video_depth_anything/util/__pycache__/blocks.cpython-310.pyc +0 -0
  44. extern/video_depth_anything/util/__pycache__/transform.cpython-310.pyc +0 -0
  45. extern/video_depth_anything/util/__pycache__/util.cpython-310.pyc +0 -0
  46. extern/video_depth_anything/util/blocks.py +162 -0
  47. extern/video_depth_anything/util/transform.py +158 -0
  48. extern/video_depth_anything/util/util.py +74 -0
  49. extern/video_depth_anything/vdademo.py +63 -0
  50. extern/video_depth_anything/video_depth.py +154 -0
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
app.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import sys
4
+ from demo import TrajCrafter
5
+ import random
6
+ import gradio as gr
7
+ import random
8
+ from inference import get_parser
9
+ from datetime import datetime
10
+ import argparse
11
+
12
+ # 解析命令行参数
13
+
14
+ traj_examples = [
15
+ ['20; -30; 0.3; 0; 0'],
16
+ ['0; 0; -0.3; -2; 2'],
17
+ ]
18
+
19
+ # inputs=[i2v_input_video, i2v_stride, i2v_center_scale, i2v_pose, i2v_steps, i2v_seed],
20
+
21
+ img_examples = [
22
+ ['test/videos/0-NNvgaTcVzAG0-r.mp4',2,1,'0; -30; 0.5; -2; 0',50,43],
23
+ ['test/videos/tUfDESZsQFhdDW9S.mp4',2,1,'0; 30; -0.4; 2; 0',50,43],
24
+ ['test/videos/part-2-3.mp4',2,1,'20; 40; 0.5; 2; 0',50,43],
25
+ ['test/videos/p7.mp4',2,1,'0; -50; 0.3; 0; 0',50,43],
26
+ ['test/videos/UST-fn-RvhJwMR5S.mp4',2,1,'0; -35; 0.4; 0; 0',50,43],
27
+ ]
28
+
29
+ max_seed = 2 ** 31
30
+
31
+ parser = get_parser() # infer_config.py
32
+ opts = parser.parse_args() # default device: 'cuda:0'
33
+ opts.weight_dtype = torch.bfloat16
34
+ tmp = datetime.now().strftime("%Y%m%d_%H%M")
35
+ opts.save_dir = f'./experiments/gradio_{tmp}'
36
+ os.makedirs(opts.save_dir,exist_ok=True)
37
+ test_tensor = torch.Tensor([0]).cuda()
38
+ opts.device = str(test_tensor.device)
39
+
40
+ CAMERA_MOTION_MODE = ["Basic Camera Trajectory", "Custom Camera Trajectory"]
41
+
42
+ def show_traj(mode):
43
+ if mode == 'Orbit Left':
44
+ return gr.update(value='0; -30; 0; 0; 0',visible=True),gr.update(visible=False)
45
+ elif mode == 'Orbit Right':
46
+ return gr.update(value='0; 30; 0; 0; 0',visible=True),gr.update(visible=False)
47
+ elif mode == 'Orbit Up':
48
+ return gr.update(value='30; 0; 0; 0; 0',visible=True),gr.update(visible=False)
49
+ elif mode == 'Orbit Down':
50
+ return gr.update(value='-20; 0; 0; 0; 0',visible=True), gr.update(visible=False)
51
+ if mode == 'Pan Left':
52
+ return gr.update(value='0; 0; 0; -2; 0',visible=True),gr.update(visible=False)
53
+ elif mode == 'Pan Right':
54
+ return gr.update(value='0; 0; 0; 2; 0',visible=True),gr.update(visible=False)
55
+ elif mode == 'Pan Up':
56
+ return gr.update(value='0; 0; 0; 0; 2',visible=True),gr.update(visible=False)
57
+ elif mode == 'Pan Down':
58
+ return gr.update(value='0; 0; 0; 0; -2',visible=True), gr.update(visible=False)
59
+ elif mode == 'Zoom in':
60
+ return gr.update(value='0; 0; 0.5; 0; 0',visible=True), gr.update(visible=False)
61
+ elif mode == 'Zoom out':
62
+ return gr.update(value='0; 0; -0.5; 0; 0',visible=True), gr.update(visible=False)
63
+ elif mode == 'Customize':
64
+ return gr.update(value='0; 0; 0; 0; 0',visible=True), gr.update(visible=True)
65
+ elif mode == 'Reset':
66
+ return gr.update(value='0; 0; 0; 0; 0',visible=False), gr.update(visible=False)
67
+
68
+ def trajcrafter_demo(opts):
69
+ # css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px} #random_button {max-width: 100px !important}"""
70
+ css = """
71
+ #input_img {max-width: 1024px !important}
72
+ #output_vid {max-width: 1024px; max-height:576px}
73
+ #random_button {max-width: 100px !important}
74
+ .generate-btn {
75
+ background: linear-gradient(45deg, #2196F3, #1976D2) !important;
76
+ border: none !important;
77
+ color: white !important;
78
+ font-weight: bold !important;
79
+ box-shadow: 0 2px 5px rgba(0,0,0,0.2) !important;
80
+ }
81
+ .generate-btn:hover {
82
+ background: linear-gradient(45deg, #1976D2, #1565C0) !important;
83
+ box-shadow: 0 4px 8px rgba(0,0,0,0.3) !important;
84
+ }
85
+ """
86
+ image2video = TrajCrafter(opts,gradio=True)
87
+ # image2video.run_both = spaces.GPU(image2video.run_both, duration=290) # fixme
88
+ with gr.Blocks(analytics_enabled=False, css=css) as trajcrafter_iface:
89
+ gr.Markdown("<div align='center'> <h1> TrajectoryCrafter: Redirecting View Trajectory for Monocular Videos via Diffusion Models </span> </h1>")
90
+ # # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
91
+ # # <a style='font-size:18px;color: #000000' href='https://arxiv.org/abs/2409.02048'> [ArXiv] </a>\
92
+ # # <a style='font-size:18px;color: #000000' href='https://drexubery.github.io/ViewCrafter/'> [Project Page] </a>\
93
+ # # <a style='font-size:18px;color: #FF5DB0' href='https://github.com/Drexubery/ViewCrafter'> [Github] </a>\
94
+ # # <a style='font-size:18px;color: #000000' href='https://www.youtube.com/watch?v=WGIEmu9eXmU'> [Video] </a> </div>")
95
+
96
+
97
+ with gr.Row(equal_height=True):
98
+ with gr.Column():
99
+ # # step 1: input an image
100
+ # gr.Markdown("---\n## Step 1: Input an Image, selet an elevation angle and a center_scale factor", show_label=False, visible=True)
101
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>1. Estimate an elevation angle that represents the angle at which the image was taken; a value bigger than 0 indicates a top-down view, and it doesn't need to be precise. <br>2. The origin of the world coordinate system is by default defined at the point cloud corresponding to the center pixel of the input image. You can adjust the position of the origin by modifying center_scale; a value smaller than 1 brings the origin closer to you.</div>")
102
+ i2v_input_video = gr.Video(label="Input Video", elem_id="input_video", format="mp4")
103
+
104
+
105
+ with gr.Column():
106
+ i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True,
107
+ show_share_button=True)
108
+
109
+ with gr.Row():
110
+ with gr.Row():
111
+ i2v_stride = gr.Slider(minimum=1, maximum=3, step=1, elem_id="stride", label="Stride", value=1)
112
+ i2v_center_scale = gr.Slider(minimum=0.1, maximum=2, step=0.1, elem_id="i2v_center_scale",
113
+ label="center_scale", value=1)
114
+ i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps",
115
+ value=50)
116
+ i2v_seed = gr.Slider(label='Random seed', minimum=0, maximum=max_seed, step=1, value=43)
117
+ with gr.Row():
118
+ pan_left = gr.Button(value="Pan Left")
119
+ pan_right = gr.Button(value="Pan Right")
120
+ pan_up = gr.Button(value="Pan Up")
121
+ pan_down = gr.Button(value="Pan Down")
122
+ with gr.Row():
123
+ orbit_left = gr.Button(value="Orbit Left")
124
+ orbit_right = gr.Button(value="Orbit Right")
125
+ orbit_up = gr.Button(value="Orbit Up")
126
+ orbit_down = gr.Button(value="Orbit Down")
127
+ with gr.Row():
128
+ zin = gr.Button(value="Zoom in")
129
+ zout = gr.Button(value="Zoom out")
130
+ custom = gr.Button(value="Customize")
131
+ reset = gr.Button(value="Reset")
132
+ with gr.Column():
133
+ with gr.Row():
134
+ with gr.Column():
135
+ i2v_pose = gr.Text(value='0; 0; 0; 0; 0', label="Traget camera pose (theta, phi, r, x, y)",
136
+ visible=False)
137
+ with gr.Column(visible=False) as i2v_egs:
138
+ gr.Markdown(
139
+ "<div align='left' style='font-size:18px;color: #000000'>Please refer to <a href='https://github.com/TrajectoryCrafter/TrajectoryCrafter/blob/main/docs/config_help.md' target='_blank'>tutorial</a> for customizing camera trajectory.</div>")
140
+ gr.Examples(examples=traj_examples,
141
+ inputs=[i2v_pose],
142
+ )
143
+ with gr.Column():
144
+ i2v_end_btn = gr.Button("Generate video", scale=2, size="lg", variant="primary", elem_classes="generate-btn")
145
+
146
+
147
+ # with gr.Column():
148
+ # i2v_input_video = gr.Video(label="Input Video", elem_id="input_video", format="mp4")
149
+ # i2v_input_image = gr.Image(label="Input Image",elem_id="input_img")
150
+ # with gr.Row():
151
+ # # i2v_elevation = gr.Slider(minimum=-45, maximum=45, step=1, elem_id="elevation", label="elevation", value=5)
152
+ # i2v_center_scale = gr.Slider(minimum=0.1, maximum=2, step=0.1, elem_id="i2v_center_scale", label="center_scale", value=1)
153
+ # i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
154
+ # i2v_seed = gr.Slider(label='Random seed', minimum=0, maximum=max_seed, step=1, value=43)
155
+ # with gr.Column():
156
+ # with gr.Row():
157
+ # left = gr.Button(value = "Left")
158
+ # right = gr.Button(value = "Right")
159
+ # up = gr.Button(value = "Up")
160
+ # with gr.Row():
161
+ # down = gr.Button(value = "Down")
162
+ # zin = gr.Button(value = "Zoom in")
163
+ # zout = gr.Button(value = "Zoom out")
164
+ # with gr.Row():
165
+ # custom = gr.Button(value = "Customize")
166
+ # reset = gr.Button(value = "Reset")
167
+
168
+
169
+ # step 3 - Generate video
170
+ # with gr.Column():
171
+ # gr.Markdown("---\n## Step 3: Generate video", show_label=False, visible=True)
172
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> You can reduce the sampling steps for faster inference; try different random seed if the result is not satisfying. </div>")
173
+ # i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
174
+ # i2v_end_btn = gr.Button("Generate video")
175
+ # i2v_traj_video = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
176
+
177
+ # with gr.Column(scale=1.5):
178
+ # with gr.Row():
179
+ # # i2v_elevation = gr.Slider(minimum=-45, maximum=45, step=1, elem_id="elevation", label="elevation", value=5)
180
+ # i2v_center_scale = gr.Slider(minimum=0.1, maximum=2, step=0.1, elem_id="i2v_center_scale", label="center_scale", value=1)
181
+ # i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
182
+ # i2v_seed = gr.Slider(label='Random seed', minimum=0, maximum=max_seed, step=1, value=43)
183
+ # with gr.Row():
184
+ # pan_left = gr.Button(value = "Pan Left")
185
+ # pan_right = gr.Button(value = "Pan Right")
186
+ # pan_up = gr.Button(value = "Pan Up")
187
+ # pan_down = gr.Button(value = "Pan Down")
188
+ # with gr.Row():
189
+ # orbit_left = gr.Button(value = "Orbit Left")
190
+ # orbit_right = gr.Button(value = "Orbit Right")
191
+ # orbit_up = gr.Button(value = "Orbit Up")
192
+ # orbit_down = gr.Button(value = "Orbit Down")
193
+ # with gr.Row():
194
+ # zin = gr.Button(value = "Zoom in")
195
+ # zout = gr.Button(value = "Zoom out")
196
+ # custom = gr.Button(value = "Customize")
197
+ # reset = gr.Button(value = "Reset")
198
+ # with gr.Column():
199
+ # with gr.Row():
200
+ # with gr.Column():
201
+ # i2v_pose = gr.Text(value = '0; 0; 0; 0; 0', label="Traget camera pose (theta, phi, r, x, y)",visible=False)
202
+ # with gr.Column(visible=False) as i2v_egs:
203
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>Please refer to the <a href='https://github.com/Drexubery/ViewCrafter/blob/main/docs/gradio_tutorial.md' target='_blank'>tutorial</a> for customizing camera trajectory.</div>")
204
+ # gr.Examples(examples=traj_examples,
205
+ # inputs=[i2v_pose],
206
+ # )
207
+ # with gr.Row():
208
+ # i2v_end_btn = gr.Button("Generate video")
209
+ # step 3 - Generate video
210
+ # with gr.Row():
211
+ # with gr.Column():
212
+
213
+
214
+
215
+ i2v_end_btn.click(inputs=[i2v_input_video, i2v_stride, i2v_center_scale, i2v_pose, i2v_steps, i2v_seed],
216
+ outputs=[i2v_output_video],
217
+ fn = image2video.run_gradio
218
+ )
219
+
220
+ pan_left.click(inputs=[pan_left],
221
+ outputs=[i2v_pose,i2v_egs],
222
+ fn = show_traj
223
+ )
224
+ pan_right.click(inputs=[pan_right],
225
+ outputs=[i2v_pose,i2v_egs],
226
+ fn = show_traj
227
+ )
228
+ pan_up.click(inputs=[pan_up],
229
+ outputs=[i2v_pose,i2v_egs],
230
+ fn = show_traj
231
+ )
232
+ pan_down.click(inputs=[pan_down],
233
+ outputs=[i2v_pose,i2v_egs],
234
+ fn = show_traj
235
+ )
236
+ orbit_left.click(inputs=[orbit_left],
237
+ outputs=[i2v_pose,i2v_egs],
238
+ fn = show_traj
239
+ )
240
+ orbit_right.click(inputs=[orbit_right],
241
+ outputs=[i2v_pose,i2v_egs],
242
+ fn = show_traj
243
+ )
244
+ orbit_up.click(inputs=[orbit_up],
245
+ outputs=[i2v_pose,i2v_egs],
246
+ fn = show_traj
247
+ )
248
+ orbit_down.click(inputs=[orbit_down],
249
+ outputs=[i2v_pose,i2v_egs],
250
+ fn = show_traj
251
+ )
252
+ zin.click(inputs=[zin],
253
+ outputs=[i2v_pose,i2v_egs],
254
+ fn = show_traj
255
+ )
256
+ zout.click(inputs=[zout],
257
+ outputs=[i2v_pose,i2v_egs],
258
+ fn = show_traj
259
+ )
260
+ custom.click(inputs=[custom],
261
+ outputs=[i2v_pose,i2v_egs],
262
+ fn = show_traj
263
+ )
264
+ reset.click(inputs=[reset],
265
+ outputs=[i2v_pose,i2v_egs],
266
+ fn = show_traj
267
+ )
268
+
269
+
270
+ gr.Examples(examples=img_examples,
271
+ # inputs=[i2v_input_video,i2v_stride],
272
+ inputs=[i2v_input_video, i2v_stride, i2v_center_scale, i2v_pose, i2v_steps, i2v_seed],
273
+ )
274
+
275
+ return trajcrafter_iface
276
+
277
+
278
+ trajcrafter_iface = trajcrafter_demo(opts)
279
+ trajcrafter_iface.queue(max_size=10)
280
+ # trajcrafter_iface.launch(server_name=args.server_name, max_threads=10, debug=True)
281
+ trajcrafter_iface.launch(server_name="0.0.0.0", server_port=12345, debug=True, share=False, max_threads=10)
282
+
283
+
284
+
demo.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import os
3
+ import torch
4
+ from extern.depthcrafter.infer import DepthCrafterDemo
5
+ # from extern.video_depth_anything.vdademo import VDADemo
6
+ import numpy as np
7
+ import torch
8
+ from transformers import T5EncoderModel
9
+ from omegaconf import OmegaConf
10
+ from PIL import Image
11
+ from models.crosstransformer3d import CrossTransformer3DModel
12
+ from models.autoencoder_magvit import AutoencoderKLCogVideoX
13
+ from models.pipeline_trajectorycrafter import TrajCrafter_Pipeline
14
+ from models.utils import *
15
+ from diffusers import (AutoencoderKL, CogVideoXDDIMScheduler, DDIMScheduler,
16
+ DPMSolverMultistepScheduler,
17
+ EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
18
+ PNDMScheduler)
19
+ from transformers import AutoProcessor, Blip2ForConditionalGeneration
20
+
21
+ class TrajCrafter:
22
+ def __init__(self, opts, gradio=False):
23
+ self.funwarp = Warper(device=opts.device)
24
+ # self.depth_estimater = VDADemo(pre_train_path=opts.pre_train_path_vda,device=opts.device)
25
+ self.depth_estimater = DepthCrafterDemo(unet_path=opts.unet_path,pre_train_path=opts.pre_train_path,cpu_offload=opts.cpu_offload,device=opts.device)
26
+ self.caption_processor = AutoProcessor.from_pretrained(opts.blip_path)
27
+ self.captioner = Blip2ForConditionalGeneration.from_pretrained(opts.blip_path, torch_dtype=torch.float16).to(opts.device)
28
+ self.setup_diffusion(opts)
29
+ if gradio:
30
+ self.opts=opts
31
+
32
+ def infer_gradual(self,opts):
33
+ frames = read_video_frames(opts.video_path,opts.video_length,opts.stride,opts.max_res)
34
+ prompt = self.get_caption(opts,frames[opts.video_length//2])
35
+ # depths= self.depth_estimater.infer(frames, opts.near, opts.far).to(opts.device)
36
+ depths= self.depth_estimater.infer(frames, opts.near, opts.far, opts.depth_inference_steps, opts.depth_guidance_scale, window_size=opts.window_size, overlap=opts.overlap).to(opts.device)
37
+ frames = torch.from_numpy(frames).permute(0,3,1,2).to(opts.device)*2.-1. # 49 576 1024 3 -> 49 3 576 1024, [-1,1]
38
+ assert frames.shape[0] == opts.video_length
39
+ pose_s, pose_t, K = self.get_poses(opts,depths,num_frames = opts.video_length)
40
+ warped_images = []
41
+ masks = []
42
+ for i in tqdm(range(opts.video_length)):
43
+ warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp(frames[i:i+1], None, depths[i:i+1], pose_s[i:i+1], pose_t[i:i+1], K[i:i+1], None, opts.mask,twice=False)
44
+ warped_images.append(warped_frame2)
45
+ masks.append(mask2)
46
+ cond_video = (torch.cat(warped_images)+1.)/2.
47
+ cond_masks = torch.cat(masks)
48
+
49
+ frames = F.interpolate(frames, size=opts.sample_size, mode='bilinear', align_corners=False)
50
+ cond_video = F.interpolate(cond_video, size=opts.sample_size, mode='bilinear', align_corners=False)
51
+ cond_masks = F.interpolate(cond_masks, size=opts.sample_size, mode='nearest')
52
+ save_video((frames.permute(0,2,3,1)+1.)/2., os.path.join(opts.save_dir,'input.mp4'),fps=opts.fps)
53
+ save_video(cond_video.permute(0,2,3,1), os.path.join(opts.save_dir,'render.mp4'),fps=opts.fps)
54
+ save_video(cond_masks.repeat(1,3,1,1).permute(0,2,3,1), os.path.join(opts.save_dir,'mask.mp4'),fps=opts.fps)
55
+
56
+ frames = (frames.permute(1,0,2,3).unsqueeze(0)+1.)/2.
57
+ frames_ref = frames[:,:,:10,:,:]
58
+ cond_video = cond_video.permute(1,0,2,3).unsqueeze(0)
59
+ cond_masks = (1.-cond_masks.permute(1,0,2,3).unsqueeze(0))*255.
60
+ generator = torch.Generator(device=opts.device).manual_seed(opts.seed)
61
+
62
+ del self.depth_estimater
63
+ del self.caption_processor
64
+ del self.captioner
65
+ gc.collect()
66
+ torch.cuda.empty_cache()
67
+ with torch.no_grad():
68
+ sample = self.pipeline(
69
+ prompt,
70
+ num_frames = opts.video_length,
71
+ negative_prompt = opts.negative_prompt,
72
+ height = opts.sample_size[0],
73
+ width = opts.sample_size[1],
74
+ generator = generator,
75
+ guidance_scale = opts.diffusion_guidance_scale,
76
+ num_inference_steps = opts.diffusion_inference_steps,
77
+ video = cond_video,
78
+ mask_video = cond_masks,
79
+ reference = frames_ref,
80
+ ).videos
81
+ save_video(sample[0].permute(1,2,3,0), os.path.join(opts.save_dir,'gen.mp4'), fps=opts.fps)
82
+
83
+ viz = True
84
+ if viz:
85
+ tensor_left = frames[0].to(opts.device)
86
+ tensor_right = sample[0].to(opts.device)
87
+ interval = torch.ones(3, 49, 384, 30).to(opts.device)
88
+ result = torch.cat((tensor_left, interval, tensor_right), dim=3)
89
+ result_reverse = torch.flip(result, dims=[1])
90
+ final_result = torch.cat((result, result_reverse[:,1:,:,:]), dim=1)
91
+ save_video(final_result.permute(1,2,3,0), os.path.join(opts.save_dir,'viz.mp4'), fps=opts.fps*2)
92
+
93
+ def infer_direct(self,opts):
94
+ opts.cut = 20
95
+ frames = read_video_frames(opts.video_path,opts.video_length,opts.stride,opts.max_res)
96
+ prompt = self.get_caption(opts,frames[opts.video_length//2])
97
+ # depths= self.depth_estimater.infer(frames, opts.near, opts.far).to(opts.device)
98
+ depths= self.depth_estimater.infer(frames, opts.near, opts.far, opts.depth_inference_steps, opts.depth_guidance_scale, window_size=opts.window_size, overlap=opts.overlap).to(opts.device)
99
+ frames = torch.from_numpy(frames).permute(0,3,1,2).to(opts.device)*2.-1. # 49 576 1024 3 -> 49 3 576 1024, [-1,1]
100
+ assert frames.shape[0] == opts.video_length
101
+ pose_s, pose_t, K = self.get_poses(opts,depths,num_frames = opts.cut)
102
+
103
+ warped_images = []
104
+ masks = []
105
+ for i in tqdm(range(opts.video_length)):
106
+ if i < opts.cut:
107
+ warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp(frames[0:1], None, depths[0:1], pose_s[0:1], pose_t[i:i+1], K[0:1], None, opts.mask,twice=False)
108
+ warped_images.append(warped_frame2)
109
+ masks.append(mask2)
110
+ else:
111
+ warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp(frames[i-opts.cut:i-opts.cut+1], None, depths[i-opts.cut:i-opts.cut+1], pose_s[0:1], pose_t[-1:], K[0:1], None, opts.mask,twice=False)
112
+ warped_images.append(warped_frame2)
113
+ masks.append(mask2)
114
+ cond_video = (torch.cat(warped_images)+1.)/2.
115
+ cond_masks = torch.cat(masks)
116
+ frames = F.interpolate(frames, size=opts.sample_size, mode='bilinear', align_corners=False)
117
+ cond_video = F.interpolate(cond_video, size=opts.sample_size, mode='bilinear', align_corners=False)
118
+ cond_masks = F.interpolate(cond_masks, size=opts.sample_size, mode='nearest')
119
+ save_video((frames[:opts.video_length-opts.cut].permute(0,2,3,1)+1.)/2., os.path.join(opts.save_dir,'input.mp4'),fps=opts.fps)
120
+ save_video(cond_video[opts.cut:].permute(0,2,3,1), os.path.join(opts.save_dir,'render.mp4'),fps=opts.fps)
121
+ save_video(cond_masks[opts.cut:].repeat(1,3,1,1).permute(0,2,3,1), os.path.join(opts.save_dir,'mask.mp4'),fps=opts.fps)
122
+ frames = (frames.permute(1,0,2,3).unsqueeze(0)+1.)/2.
123
+ frames_ref = frames[:,:,:10,:,:]
124
+ cond_video = cond_video.permute(1,0,2,3).unsqueeze(0)
125
+ cond_masks = (1.-cond_masks.permute(1,0,2,3).unsqueeze(0))*255.
126
+ generator = torch.Generator(device=opts.device).manual_seed(opts.seed)
127
+
128
+ del self.depth_estimater
129
+ del self.caption_processor
130
+ del self.captioner
131
+ gc.collect()
132
+ torch.cuda.empty_cache()
133
+ with torch.no_grad():
134
+ sample = self.pipeline(
135
+ prompt,
136
+ num_frames = opts.video_length,
137
+ negative_prompt = opts.negative_prompt,
138
+ height = opts.sample_size[0],
139
+ width = opts.sample_size[1],
140
+ generator = generator,
141
+ guidance_scale = opts.diffusion_guidance_scale,
142
+ num_inference_steps = opts.diffusion_inference_steps,
143
+ video = cond_video,
144
+ mask_video = cond_masks,
145
+ reference = frames_ref,
146
+ ).videos
147
+ save_video(sample[0].permute(1,2,3,0)[opts.cut:], os.path.join(opts.save_dir,'gen.mp4'), fps=opts.fps)
148
+
149
+ viz = True
150
+ if viz:
151
+ tensor_left = frames[0][:,:opts.video_length-opts.cut,...].to(opts.device)
152
+ tensor_right = sample[0][:,opts.cut:,...].to(opts.device)
153
+ interval = torch.ones(3, opts.video_length-opts.cut, 384, 30).to(opts.device)
154
+ result = torch.cat((tensor_left, interval, tensor_right), dim=3)
155
+ result_reverse = torch.flip(result, dims=[1])
156
+ final_result = torch.cat((result, result_reverse[:,1:,:,:]), dim=1)
157
+ save_video(final_result.permute(1,2,3,0), os.path.join(opts.save_dir,'viz.mp4'), fps=opts.fps*2)
158
+
159
+ def infer_bullet(self,opts):
160
+ frames = read_video_frames(opts.video_path,opts.video_length,opts.stride,opts.max_res)
161
+ prompt = self.get_caption(opts,frames[opts.video_length//2])
162
+ # depths= self.depth_estimater.infer(frames, opts.near, opts.far).to(opts.device)
163
+ depths= self.depth_estimater.infer(frames, opts.near, opts.far, opts.depth_inference_steps, opts.depth_guidance_scale, window_size=opts.window_size, overlap=opts.overlap).to(opts.device)
164
+
165
+ frames = torch.from_numpy(frames).permute(0,3,1,2).to(opts.device)*2.-1. # 49 576 1024 3 -> 49 3 576 1024, [-1,1]
166
+ assert frames.shape[0] == opts.video_length
167
+ pose_s, pose_t, K = self.get_poses(opts,depths, num_frames = opts.video_length)
168
+
169
+ warped_images = []
170
+ masks = []
171
+ for i in tqdm(range(opts.video_length)):
172
+ warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp(frames[-1:], None, depths[-1:], pose_s[0:1], pose_t[i:i+1], K[0:1], None, opts.mask,twice=False)
173
+ warped_images.append(warped_frame2)
174
+ masks.append(mask2)
175
+ cond_video = (torch.cat(warped_images)+1.)/2.
176
+ cond_masks = torch.cat(masks)
177
+ frames = F.interpolate(frames, size=opts.sample_size, mode='bilinear', align_corners=False)
178
+ cond_video = F.interpolate(cond_video, size=opts.sample_size, mode='bilinear', align_corners=False)
179
+ cond_masks = F.interpolate(cond_masks, size=opts.sample_size, mode='nearest')
180
+ save_video((frames.permute(0,2,3,1)+1.)/2., os.path.join(opts.save_dir,'input.mp4'),fps=opts.fps)
181
+ save_video(cond_video.permute(0,2,3,1), os.path.join(opts.save_dir,'render.mp4'),fps=opts.fps)
182
+ save_video(cond_masks.repeat(1,3,1,1).permute(0,2,3,1), os.path.join(opts.save_dir,'mask.mp4'),fps=opts.fps)
183
+ frames = (frames.permute(1,0,2,3).unsqueeze(0)+1.)/2.
184
+ frames_ref = frames[:,:,-10:,:,:]
185
+ cond_video = cond_video.permute(1,0,2,3).unsqueeze(0)
186
+ cond_masks = (1.-cond_masks.permute(1,0,2,3).unsqueeze(0))*255.
187
+ generator = torch.Generator(device=opts.device).manual_seed(opts.seed)
188
+
189
+ del self.depth_estimater
190
+ del self.caption_processor
191
+ del self.captioner
192
+ gc.collect()
193
+ torch.cuda.empty_cache()
194
+ with torch.no_grad():
195
+ sample = self.pipeline(
196
+ prompt,
197
+ num_frames = opts.video_length,
198
+ negative_prompt = opts.negative_prompt,
199
+ height = opts.sample_size[0],
200
+ width = opts.sample_size[1],
201
+ generator = generator,
202
+ guidance_scale = opts.diffusion_guidance_scale,
203
+ num_inference_steps = opts.diffusion_inference_steps,
204
+ video = cond_video,
205
+ mask_video = cond_masks,
206
+ reference = frames_ref,
207
+ ).videos
208
+ save_video(sample[0].permute(1,2,3,0), os.path.join(opts.save_dir,'gen.mp4'), fps=opts.fps)
209
+
210
+ viz = True
211
+ if viz:
212
+ tensor_left = frames[0].to(opts.device)
213
+ tensor_left_full = torch.cat([tensor_left,tensor_left[:,-1:,:,:].repeat(1,48,1,1)],dim=1)
214
+ tensor_right = sample[0].to(opts.device)
215
+ tensor_right_full = torch.cat([tensor_left,tensor_right[:,1:,:,:]],dim=1)
216
+ interval = torch.ones(3, 49*2-1, 384, 30).to(opts.device)
217
+ result = torch.cat((tensor_left_full, interval, tensor_right_full), dim=3)
218
+ result_reverse = torch.flip(result, dims=[1])
219
+ final_result = torch.cat((result, result_reverse[:,1:,:,:]), dim=1)
220
+ save_video(final_result.permute(1,2,3,0), os.path.join(opts.save_dir,'viz.mp4'), fps=opts.fps*4)
221
+
222
+ def get_caption(self,opts,image):
223
+ image_array = (image * 255).astype(np.uint8)
224
+ pil_image = Image.fromarray(image_array)
225
+ inputs = self.caption_processor(images=pil_image, return_tensors="pt").to(opts.device, torch.float16)
226
+ generated_ids = self.captioner.generate(**inputs)
227
+ generated_text = self.caption_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
228
+ return generated_text + opts.refine_prompt
229
+
230
+ def get_poses(self,opts,depths,num_frames):
231
+ radius = depths[0,0,depths.shape[-2]//2,depths.shape[-1]//2].cpu()*opts.radius_scale
232
+ radius = min(radius, 5)
233
+ cx = 512. #depths.shape[-1]//2
234
+ cy = 288. #depths.shape[-2]//2
235
+ f = 500 #500.
236
+ K = torch.tensor([[f, 0., cx],[ 0., f, cy],[ 0., 0., 1.]]).repeat(num_frames,1,1).to(opts.device)
237
+ c2w_init = torch.tensor([[-1., 0., 0., 0.],
238
+ [ 0., 1., 0., 0.],
239
+ [ 0., 0., -1., 0.],
240
+ [ 0., 0., 0., 1.]]).to(opts.device).unsqueeze(0)
241
+ if opts.camera == 'target':
242
+ dtheta, dphi, dr, dx, dy = opts.target_pose
243
+ poses = generate_traj_specified(c2w_init, dtheta, dphi, dr*radius, dx, dy, num_frames, opts.device)
244
+ elif opts.camera =='traj':
245
+ with open(opts.traj_txt, 'r') as file:
246
+ lines = file.readlines()
247
+ theta = [float(i) for i in lines[0].split()]
248
+ phi = [float(i) for i in lines[1].split()]
249
+ r = [float(i)*radius for i in lines[2].split()]
250
+ poses = generate_traj_txt(c2w_init, phi, theta, r, num_frames, opts.device)
251
+ poses[:,2, 3] = poses[:,2, 3] + radius
252
+ pose_s = poses[opts.anchor_idx:opts.anchor_idx+1].repeat(num_frames,1,1)
253
+ pose_t = poses
254
+ return pose_s, pose_t, K
255
+
256
+ def setup_diffusion(self,opts):
257
+ # transformer = CrossTransformer3DModel.from_pretrained_cus(opts.transformer_path).to(opts.weight_dtype)
258
+ transformer = CrossTransformer3DModel.from_pretrained(opts.transformer_path).to(opts.weight_dtype)
259
+ # transformer = transformer.to(opts.weight_dtype)
260
+ vae = AutoencoderKLCogVideoX.from_pretrained(
261
+ opts.model_name,
262
+ subfolder="vae"
263
+ ).to(opts.weight_dtype)
264
+ text_encoder = T5EncoderModel.from_pretrained(
265
+ opts.model_name, subfolder="text_encoder", torch_dtype=opts.weight_dtype
266
+ )
267
+ # Get Scheduler
268
+ Choosen_Scheduler = {
269
+ "Euler": EulerDiscreteScheduler,
270
+ "Euler A": EulerAncestralDiscreteScheduler,
271
+ "DPM++": DPMSolverMultistepScheduler,
272
+ "PNDM": PNDMScheduler,
273
+ "DDIM_Cog": CogVideoXDDIMScheduler,
274
+ "DDIM_Origin": DDIMScheduler,
275
+ }[opts.sampler_name]
276
+ scheduler = Choosen_Scheduler.from_pretrained(
277
+ opts.model_name,
278
+ subfolder="scheduler"
279
+ )
280
+
281
+ self.pipeline = TrajCrafter_Pipeline.from_pretrained(
282
+ opts.model_name,
283
+ vae=vae,
284
+ text_encoder=text_encoder,
285
+ transformer=transformer,
286
+ scheduler=scheduler,
287
+ torch_dtype=opts.weight_dtype
288
+ )
289
+
290
+ if opts.low_gpu_memory_mode:
291
+ self.pipeline.enable_sequential_cpu_offload()
292
+ else:
293
+ self.pipeline.enable_model_cpu_offload()
294
+
295
+ def run_gradio(self,input_video, stride, radius_scale, pose, steps, seed):
296
+ frames = read_video_frames(input_video, self.opts.video_length, stride,self.opts.max_res)
297
+ prompt = self.get_caption(self.opts,frames[self.opts.video_length//2])
298
+ # depths= self.depth_estimater.infer(frames, opts.near, opts.far).to(opts.device)
299
+ depths= self.depth_estimater.infer(frames, self.opts.near, self.opts.far, self.opts.depth_inference_steps, self.opts.depth_guidance_scale, window_size=self.opts.window_size, overlap=self.opts.overlap).to(self.opts.device)
300
+ frames = torch.from_numpy(frames).permute(0,3,1,2).to(self.opts.device)*2.-1. # 49 576 1024 3 -> 49 3 576 1024, [-1,1]
301
+ num_frames = frames.shape[0]
302
+ assert num_frames == self.opts.video_length
303
+ radius_scale = float(radius_scale)
304
+ radius = depths[0,0,depths.shape[-2]//2,depths.shape[-1]//2].cpu()*radius_scale
305
+ radius = min(radius, 5)
306
+ cx = 512. #depths.shape[-1]//2
307
+ cy = 288. #depths.shape[-2]//2
308
+ f = 500 #500.
309
+ K = torch.tensor([[f, 0., cx],[ 0., f, cy],[ 0., 0., 1.]]).repeat(num_frames,1,1).to(self.opts.device)
310
+ c2w_init = torch.tensor([[-1., 0., 0., 0.],
311
+ [ 0., 1., 0., 0.],
312
+ [ 0., 0., -1., 0.],
313
+ [ 0., 0., 0., 1.]]).to(self.opts.device).unsqueeze(0)
314
+
315
+ # import pdb
316
+ # pdb.set_trace()
317
+ theta,phi,r,x,y = [float(i) for i in pose.split(';')]
318
+ # theta,phi,r,x,y = [float(i) for i in theta.split()],[float(i) for i in phi.split()],[float(i) for i in r.split()],[float(i) for i in x.split()],[float(i) for i in y.split()]
319
+ # target mode
320
+ poses = generate_traj_specified(c2w_init, theta, phi, r*radius, x, y, num_frames, self.opts.device)
321
+ poses[:,2, 3] = poses[:,2, 3] + radius
322
+ pose_s = poses[self.opts.anchor_idx:self.opts.anchor_idx+1].repeat(num_frames,1,1)
323
+ pose_t = poses
324
+
325
+ warped_images = []
326
+ masks = []
327
+ for i in tqdm(range(self.opts.video_length)):
328
+ warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp(frames[i:i+1], None, depths[i:i+1], pose_s[i:i+1], pose_t[i:i+1], K[i:i+1], None, self.opts.mask,twice=False)
329
+ warped_images.append(warped_frame2)
330
+ masks.append(mask2)
331
+ cond_video = (torch.cat(warped_images)+1.)/2.
332
+ cond_masks = torch.cat(masks)
333
+
334
+ frames = F.interpolate(frames, size=self.opts.sample_size, mode='bilinear', align_corners=False)
335
+ cond_video = F.interpolate(cond_video, size=self.opts.sample_size, mode='bilinear', align_corners=False)
336
+ cond_masks = F.interpolate(cond_masks, size=self.opts.sample_size, mode='nearest')
337
+ save_video((frames.permute(0,2,3,1)+1.)/2., os.path.join(self.opts.save_dir,'input.mp4'),fps=self.opts.fps)
338
+ save_video(cond_video.permute(0,2,3,1), os.path.join(self.opts.save_dir,'render.mp4'),fps=self.opts.fps)
339
+ save_video(cond_masks.repeat(1,3,1,1).permute(0,2,3,1), os.path.join(self.opts.save_dir,'mask.mp4'),fps=self.opts.fps)
340
+
341
+ frames = (frames.permute(1,0,2,3).unsqueeze(0)+1.)/2.
342
+ frames_ref = frames[:,:,:10,:,:]
343
+ cond_video = cond_video.permute(1,0,2,3).unsqueeze(0)
344
+ cond_masks = (1.-cond_masks.permute(1,0,2,3).unsqueeze(0))*255.
345
+ generator = torch.Generator(device=self.opts.device).manual_seed(seed)
346
+
347
+ # del self.depth_estimater
348
+ # del self.caption_processor
349
+ # del self.captioner
350
+ # gc.collect()
351
+ torch.cuda.empty_cache()
352
+ with torch.no_grad():
353
+ sample = self.pipeline(
354
+ prompt,
355
+ num_frames = self.opts.video_length,
356
+ negative_prompt = self.opts.negative_prompt,
357
+ height = self.opts.sample_size[0],
358
+ width = self.opts.sample_size[1],
359
+ generator = generator,
360
+ guidance_scale = self.opts.diffusion_guidance_scale,
361
+ num_inference_steps = steps,
362
+ video = cond_video,
363
+ mask_video = cond_masks,
364
+ reference = frames_ref,
365
+ ).videos
366
+ save_video(sample[0].permute(1,2,3,0), os.path.join(self.opts.save_dir,'gen.mp4'), fps=self.opts.fps)
367
+
368
+ viz = True
369
+ if viz:
370
+ tensor_left = frames[0].to(self.opts.device)
371
+ tensor_right = sample[0].to(self.opts.device)
372
+ interval = torch.ones(3, 49, 384, 30).to(self.opts.device)
373
+ result = torch.cat((tensor_left, interval, tensor_right), dim=3)
374
+ result_reverse = torch.flip(result, dims=[1])
375
+ final_result = torch.cat((result, result_reverse[:,1:,:,:]), dim=1)
376
+ save_video(final_result.permute(1,2,3,0), os.path.join(self.opts.save_dir,'viz.mp4'), fps=self.opts.fps*2)
377
+ return os.path.join(self.opts.save_dir,'viz.mp4')
docs/config_help.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Important configuration for [inference.py](../inference.py):
2
+
3
+ ### 1. General configs
4
+ | Configuration | Default Value | Explanation |
5
+ |:----------------- |:--------------- |:-------------------------------------------------------- |
6
+ | `--video_path` | `None` | Input video file path |
7
+ | `--out_dir` | `./experiments/`| Output directory |
8
+ | `--device` | `cuda:0` | The device to use (e.g., CPU or GPU) |
9
+ | `--exp_name` | `None` | Experiment name, defaults to video file name |
10
+ | `--seed` | `43` | Random seed for reproducibility |
11
+ | `--video_length` | `49` | Length of the video frames (number of frames) |
12
+ | `--fps` | `10` | fps for saved video |
13
+ | `--stride` | `1` | Sampling stride for input video (frame interval) |
14
+ | `--server_name` | `None` | Server IP address for gradio |
15
+ ### 2. Point cloud render configs
16
+
17
+ | Configuration | Default Value | Explanation |
18
+ |:----------------- |:--------------- |:-------------------------------------------------------- |
19
+ | `--radius_scale` | `1.0` | Scale factor for the spherical radius |
20
+ | `--camera` | `traj` | Camera pose type, either 'traj' or 'target' |
21
+ | `--mode` | `gradual` | Mode of operation, 'gradual', 'bullet', or 'direct' |
22
+ | `--mask` | `False` | Clean the point cloud data if true |
23
+ | `--target_pose` | `None` | Required for 'target' camera pose type, specifies a relative camera pose sequece (theta, phi, r, x, y). +theta (theta<50) rotates camera upward, +phi (phi<50) rotates camera to right, +r (r<0.6) moves camera forward, +x (x<4) pans the camera to right, +y (y<4) pans the camera upward |
24
+ | `--traj_txt` | `None` | Required for 'traj' camera pose type, a txt file specifying a complex camera trajectory ([examples](../test/trajs/loop1.txt)). The fist line is the theta sequence, the second line the phi sequence, and the last line the r sequence |
25
+ | `--near` | `0.0001` | Near clipping plane distance |
26
+ | `--far` | `10000.0` | Far clipping plane distance |
27
+ | `--anchor_idx` | `0` | One GT frame for anchor frame |
extern/depthcrafter/__init__.py ADDED
File without changes
extern/depthcrafter/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (151 Bytes). View file
 
extern/depthcrafter/__pycache__/demo.cpython-310.pyc ADDED
Binary file (3.86 kB). View file
 
extern/depthcrafter/__pycache__/depth_crafter_ppl.cpython-310.pyc ADDED
Binary file (7.88 kB). View file
 
extern/depthcrafter/__pycache__/infer.cpython-310.pyc ADDED
Binary file (2.31 kB). View file
 
extern/depthcrafter/__pycache__/unet.cpython-310.pyc ADDED
Binary file (2.62 kB). View file
 
extern/depthcrafter/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.29 kB). View file
 
extern/depthcrafter/depth_crafter_ppl.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Optional, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
7
+ _resize_with_antialiasing,
8
+ StableVideoDiffusionPipelineOutput,
9
+ StableVideoDiffusionPipeline,
10
+ retrieve_timesteps,
11
+ )
12
+ from diffusers.utils import logging
13
+ from diffusers.utils.torch_utils import randn_tensor
14
+
15
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
16
+
17
+
18
+ class DepthCrafterPipeline(StableVideoDiffusionPipeline):
19
+
20
+ @torch.inference_mode()
21
+ def encode_video(
22
+ self,
23
+ video: torch.Tensor,
24
+ chunk_size: int = 14,
25
+ ) -> torch.Tensor:
26
+ """
27
+ :param video: [b, c, h, w] in range [-1, 1], the b may contain multiple videos or frames
28
+ :param chunk_size: the chunk size to encode video
29
+ :return: image_embeddings in shape of [b, 1024]
30
+ """
31
+
32
+ video_224 = _resize_with_antialiasing(video.float(), (224, 224))
33
+ video_224 = (video_224 + 1.0) / 2.0 # [-1, 1] -> [0, 1]
34
+
35
+ embeddings = []
36
+ for i in range(0, video_224.shape[0], chunk_size):
37
+ tmp = self.feature_extractor(
38
+ images=video_224[i : i + chunk_size],
39
+ do_normalize=True,
40
+ do_center_crop=False,
41
+ do_resize=False,
42
+ do_rescale=False,
43
+ return_tensors="pt",
44
+ ).pixel_values.to(video.device, dtype=video.dtype)
45
+ embeddings.append(self.image_encoder(tmp).image_embeds) # [b, 1024]
46
+
47
+ embeddings = torch.cat(embeddings, dim=0) # [t, 1024]
48
+ return embeddings
49
+
50
+ @torch.inference_mode()
51
+ def encode_vae_video(
52
+ self,
53
+ video: torch.Tensor,
54
+ chunk_size: int = 14,
55
+ ):
56
+ """
57
+ :param video: [b, c, h, w] in range [-1, 1], the b may contain multiple videos or frames
58
+ :param chunk_size: the chunk size to encode video
59
+ :return: vae latents in shape of [b, c, h, w]
60
+ """
61
+ video_latents = []
62
+ for i in range(0, video.shape[0], chunk_size):
63
+ video_latents.append(
64
+ self.vae.encode(video[i : i + chunk_size]).latent_dist.mode()
65
+ )
66
+ video_latents = torch.cat(video_latents, dim=0)
67
+ return video_latents
68
+
69
+ @staticmethod
70
+ def check_inputs(video, height, width):
71
+ """
72
+ :param video:
73
+ :param height:
74
+ :param width:
75
+ :return:
76
+ """
77
+ if not isinstance(video, torch.Tensor) and not isinstance(video, np.ndarray):
78
+ raise ValueError(
79
+ f"Expected `video` to be a `torch.Tensor` or `VideoReader`, but got a {type(video)}"
80
+ )
81
+
82
+ if height % 8 != 0 or width % 8 != 0:
83
+ raise ValueError(
84
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
85
+ )
86
+
87
+ @torch.no_grad()
88
+ def __call__(
89
+ self,
90
+ video: Union[np.ndarray, torch.Tensor],
91
+ height: int = 576,
92
+ width: int = 1024,
93
+ num_inference_steps: int = 25,
94
+ guidance_scale: float = 1.0,
95
+ window_size: Optional[int] = 110,
96
+ noise_aug_strength: float = 0.02,
97
+ decode_chunk_size: Optional[int] = None,
98
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
99
+ latents: Optional[torch.FloatTensor] = None,
100
+ output_type: Optional[str] = "pil",
101
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
102
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
103
+ return_dict: bool = True,
104
+ overlap: int = 25,
105
+ track_time: bool = False,
106
+ ):
107
+ """
108
+ :param video: in shape [t, h, w, c] if np.ndarray or [t, c, h, w] if torch.Tensor, in range [0, 1]
109
+ :param height:
110
+ :param width:
111
+ :param num_inference_steps:
112
+ :param guidance_scale:
113
+ :param window_size: sliding window processing size
114
+ :param fps:
115
+ :param motion_bucket_id:
116
+ :param noise_aug_strength:
117
+ :param decode_chunk_size:
118
+ :param generator:
119
+ :param latents:
120
+ :param output_type:
121
+ :param callback_on_step_end:
122
+ :param callback_on_step_end_tensor_inputs:
123
+ :param return_dict:
124
+ :return: StableVideoDiffusionPipelineOutput with the generated frames, or the frames/latents directly if return_dict is False
125
+ """
126
+ # 0. Default height and width to unet
127
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
128
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
129
+ num_frames = video.shape[0]
130
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else 8
131
+ if num_frames <= window_size:
132
+ window_size = num_frames
133
+ overlap = 0
134
+ stride = window_size - overlap
135
+
136
+ # 1. Check inputs. Raise error if not correct
137
+ self.check_inputs(video, height, width)
138
+
139
+ # 2. Define call parameters
140
+ batch_size = 1
141
+ device = self._execution_device
142
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
143
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
144
+ # corresponds to doing no classifier free guidance.
145
+ self._guidance_scale = guidance_scale
146
+
147
+ # 3. Encode input video
148
+ if isinstance(video, np.ndarray):
149
+ video = torch.from_numpy(video.transpose(0, 3, 1, 2))
150
+ else:
151
+ assert isinstance(video, torch.Tensor)
152
+ video = video.to(device=device, dtype=self.dtype)
153
+ video = video * 2.0 - 1.0 # [0,1] -> [-1,1], in [t, c, h, w]
154
+
155
+ if track_time:
156
+ start_event = torch.cuda.Event(enable_timing=True)
157
+ encode_event = torch.cuda.Event(enable_timing=True)
158
+ denoise_event = torch.cuda.Event(enable_timing=True)
159
+ decode_event = torch.cuda.Event(enable_timing=True)
160
+ start_event.record()
161
+
162
+ video_embeddings = self.encode_video(
163
+ video, chunk_size=decode_chunk_size
164
+ ).unsqueeze(
165
+ 0
166
+ ) # [1, t, 1024]
167
+ torch.cuda.empty_cache()
168
+ # 4. Encode input image using VAE
169
+ noise = randn_tensor(
170
+ video.shape, generator=generator, device=device, dtype=video.dtype
171
+ )
172
+ video = video + noise_aug_strength * noise # in [t, c, h, w]
173
+
174
+ # pdb.set_trace()
175
+ needs_upcasting = (
176
+ self.vae.dtype == torch.float16 and self.vae.config.force_upcast
177
+ )
178
+ if needs_upcasting:
179
+ self.vae.to(dtype=torch.float32)
180
+
181
+ video_latents = self.encode_vae_video(
182
+ video.to(self.vae.dtype),
183
+ chunk_size=decode_chunk_size,
184
+ ).unsqueeze(
185
+ 0
186
+ ) # [1, t, c, h, w]
187
+
188
+ if track_time:
189
+ encode_event.record()
190
+ torch.cuda.synchronize()
191
+ elapsed_time_ms = start_event.elapsed_time(encode_event)
192
+ print(f"Elapsed time for encoding video: {elapsed_time_ms} ms")
193
+
194
+ torch.cuda.empty_cache()
195
+
196
+ # cast back to fp16 if needed
197
+ if needs_upcasting:
198
+ self.vae.to(dtype=torch.float16)
199
+
200
+ # 5. Get Added Time IDs
201
+ added_time_ids = self._get_add_time_ids(
202
+ 7,
203
+ 127,
204
+ noise_aug_strength,
205
+ video_embeddings.dtype,
206
+ batch_size,
207
+ 1,
208
+ False,
209
+ ) # [1 or 2, 3]
210
+ added_time_ids = added_time_ids.to(device)
211
+
212
+ # 6. Prepare timesteps
213
+ timesteps, num_inference_steps = retrieve_timesteps(
214
+ self.scheduler, num_inference_steps, device, None, None
215
+ )
216
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
217
+ self._num_timesteps = len(timesteps)
218
+
219
+ # 7. Prepare latent variables
220
+ num_channels_latents = self.unet.config.in_channels
221
+ latents_init = self.prepare_latents(
222
+ batch_size,
223
+ window_size,
224
+ num_channels_latents,
225
+ height,
226
+ width,
227
+ video_embeddings.dtype,
228
+ device,
229
+ generator,
230
+ latents,
231
+ ) # [1, t, c, h, w]
232
+ latents_all = None
233
+
234
+ idx_start = 0
235
+ if overlap > 0:
236
+ weights = torch.linspace(0, 1, overlap, device=device)
237
+ weights = weights.view(1, overlap, 1, 1, 1)
238
+ else:
239
+ weights = None
240
+
241
+ torch.cuda.empty_cache()
242
+
243
+ # inference strategy for long videos
244
+ # two main strategies: 1. initialize noise from the previous window, 2. stitch overlapping segments
245
+ while idx_start < num_frames - overlap:
246
+ idx_end = min(idx_start + window_size, num_frames)
247
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
248
+
249
+ # 9. Denoising loop
250
+ latents = latents_init[:, : idx_end - idx_start].clone()
251
+ latents_init = torch.cat(
252
+ [latents_init[:, -overlap:], latents_init[:, :stride]], dim=1
253
+ )
254
+
255
+ video_latents_current = video_latents[:, idx_start:idx_end]
256
+ video_embeddings_current = video_embeddings[:, idx_start:idx_end]
257
+
258
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
259
+ for i, t in enumerate(timesteps):
260
+ if latents_all is not None and i == 0:
261
+ latents[:, :overlap] = (
262
+ latents_all[:, -overlap:]
263
+ + latents[:, :overlap]
264
+ / self.scheduler.init_noise_sigma
265
+ * self.scheduler.sigmas[i]
266
+ )
267
+
268
+ latent_model_input = latents # [1, t, c, h, w]
269
+ latent_model_input = self.scheduler.scale_model_input(
270
+ latent_model_input, t
271
+ ) # [1, t, c, h, w]
272
+ latent_model_input = torch.cat(
273
+ [latent_model_input, video_latents_current], dim=2
274
+ )
275
+ noise_pred = self.unet(
276
+ latent_model_input,
277
+ t,
278
+ encoder_hidden_states=video_embeddings_current,
279
+ added_time_ids=added_time_ids,
280
+ return_dict=False,
281
+ )[0]
282
+ # perform guidance
283
+ if self.do_classifier_free_guidance:
284
+ latent_model_input = latents
285
+ latent_model_input = self.scheduler.scale_model_input(
286
+ latent_model_input, t
287
+ )
288
+ latent_model_input = torch.cat(
289
+ [latent_model_input, torch.zeros_like(latent_model_input)],
290
+ dim=2,
291
+ )
292
+ noise_pred_uncond = self.unet(
293
+ latent_model_input,
294
+ t,
295
+ encoder_hidden_states=torch.zeros_like(
296
+ video_embeddings_current
297
+ ),
298
+ added_time_ids=added_time_ids,
299
+ return_dict=False,
300
+ )[0]
301
+
302
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
303
+ noise_pred - noise_pred_uncond
304
+ )
305
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
306
+
307
+ if callback_on_step_end is not None:
308
+ callback_kwargs = {}
309
+ for k in callback_on_step_end_tensor_inputs:
310
+ callback_kwargs[k] = locals()[k]
311
+ callback_outputs = callback_on_step_end(
312
+ self, i, t, callback_kwargs
313
+ )
314
+
315
+ latents = callback_outputs.pop("latents", latents)
316
+
317
+ if i == len(timesteps) - 1 or (
318
+ (i + 1) > num_warmup_steps
319
+ and (i + 1) % self.scheduler.order == 0
320
+ ):
321
+ progress_bar.update()
322
+
323
+ if latents_all is None:
324
+ latents_all = latents.clone()
325
+ else:
326
+ assert weights is not None
327
+ # latents_all[:, -overlap:] = (
328
+ # latents[:, :overlap] + latents_all[:, -overlap:]
329
+ # ) / 2.0
330
+ latents_all[:, -overlap:] = latents[
331
+ :, :overlap
332
+ ] * weights + latents_all[:, -overlap:] * (1 - weights)
333
+ latents_all = torch.cat([latents_all, latents[:, overlap:]], dim=1)
334
+
335
+ idx_start += stride
336
+
337
+ if track_time:
338
+ denoise_event.record()
339
+ torch.cuda.synchronize()
340
+ elapsed_time_ms = encode_event.elapsed_time(denoise_event)
341
+ print(f"Elapsed time for denoising video: {elapsed_time_ms} ms")
342
+
343
+ if not output_type == "latent":
344
+ # cast back to fp16 if needed
345
+ if needs_upcasting:
346
+ self.vae.to(dtype=torch.float16)
347
+ frames = self.decode_latents(latents_all, num_frames, decode_chunk_size)
348
+
349
+ if track_time:
350
+ decode_event.record()
351
+ torch.cuda.synchronize()
352
+ elapsed_time_ms = denoise_event.elapsed_time(decode_event)
353
+ print(f"Elapsed time for decoding video: {elapsed_time_ms} ms")
354
+
355
+ frames = self.video_processor.postprocess_video(
356
+ video=frames, output_type=output_type
357
+ )
358
+ else:
359
+ frames = latents_all
360
+
361
+ self.maybe_free_model_hooks()
362
+
363
+ if not return_dict:
364
+ return frames
365
+
366
+ return StableVideoDiffusionPipelineOutput(frames=frames)
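To make the long-video windowing above concrete, here is a small worked example under assumed settings (200 input frames with the default window_size=110 and overlap=25); the numbers are illustrative only:

num_frames, window_size, overlap = 200, 110, 25   # assumed example values
stride = window_size - overlap                    # 85
idx_start, windows = 0, []
while idx_start < num_frames - overlap:
    windows.append((idx_start, min(idx_start + window_size, num_frames)))
    idx_start += stride
print(windows)  # [(0, 110), (85, 195), (170, 200)]
# Each new window's first `overlap` latents are blended with the previous window's
# tail using the linear weights torch.linspace(0, 1, overlap) defined above.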
extern/depthcrafter/infer.py ADDED
@@ -0,0 +1,91 @@
1
+ import gc
2
+ import os
3
+ import numpy as np
4
+ import torch
5
+
6
+ from diffusers.training_utils import set_seed
7
+ from extern.depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
8
+ from extern.depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
9
+
10
+
11
+ class DepthCrafterDemo:
12
+ def __init__(
13
+ self,
14
+ unet_path: str,
15
+ pre_train_path: str,
16
+ cpu_offload: str = "model",
17
+ device: str = "cuda:0"
18
+ ):
19
+ unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
20
+ unet_path,
21
+ low_cpu_mem_usage=True,
22
+ torch_dtype=torch.float16,
23
+ )
24
+ # load weights of other components from the provided checkpoint
25
+ self.pipe = DepthCrafterPipeline.from_pretrained(
26
+ pre_train_path,
27
+ unet=unet,
28
+ torch_dtype=torch.float16,
29
+ variant="fp16",
30
+ )
31
+
32
+ # to save memory, offload the model to CPU, or run it sequentially to save even more
33
+ if cpu_offload is not None:
34
+ if cpu_offload == "sequential":
35
+ # This will be slower, but saves more memory
36
+ self.pipe.enable_sequential_cpu_offload()
37
+ elif cpu_offload == "model":
38
+ self.pipe.enable_model_cpu_offload()
39
+ else:
40
+ raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
41
+ else:
42
+ self.pipe.to(device)
43
+ # enable attention slicing and xformers memory efficient attention
44
+ try:
45
+ self.pipe.enable_xformers_memory_efficient_attention()
46
+ except Exception as e:
47
+ print(e)
48
+ print("Xformers is not enabled")
49
+ self.pipe.enable_attention_slicing()
50
+
51
+ def infer(
52
+ self,
53
+ frames,
54
+ near,
55
+ far,
56
+ num_denoising_steps: int,
57
+ guidance_scale: float,
58
+ window_size: int = 110,
59
+ overlap: int = 25,
60
+ seed: int = 42,
61
+ track_time: bool = True,
62
+ ):
63
+ set_seed(seed)
64
+
65
+ # inference the depth map using the DepthCrafter pipeline
66
+ with torch.inference_mode():
67
+ res = self.pipe(
68
+ frames,
69
+ height=frames.shape[1],
70
+ width=frames.shape[2],
71
+ output_type="np",
72
+ guidance_scale=guidance_scale,
73
+ num_inference_steps=num_denoising_steps,
74
+ window_size=window_size,
75
+ overlap=overlap,
76
+ track_time=track_time,
77
+ ).frames[0]
78
+ # convert the three-channel output to a single channel depth map
79
+ res = res.sum(-1) / res.shape[-1]
80
+ # normalize the depth map to [0, 1] across the whole video
81
+ depths = (res - res.min()) / (res.max() - res.min())
82
+ # visualize the depth map and save the results
83
+ # vis = vis_sequence_depth(res)
84
+ # save the depth map and visualization with the target FPS
85
+ depths = torch.from_numpy(depths).unsqueeze(1)  # [t, h, w] -> [t, 1, h, w]
86
+ depths *= 3900  # scale to match the Depth Anything output range
87
+ depths[depths < 1e-5] = 1e-5
88
+ depths = 10000. / depths
89
+ depths = depths.clip(near, far)
90
+
91
+ return depths
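A minimal usage sketch of this wrapper; the checkpoint identifiers and the near/far clipping bounds below are placeholders for illustration, not values fixed by this file:

import numpy as np

demo = DepthCrafterDemo(
    unet_path="tencent/DepthCrafter",                                # assumed UNet checkpoint id
    pre_train_path="stabilityai/stable-video-diffusion-img2vid-xt",  # assumed SVD base model id
    cpu_offload="model",
)
frames = np.random.rand(49, 576, 1024, 3).astype(np.float32)  # [t, h, w, c] in [0, 1]
depths = demo.infer(frames, near=0.0001, far=10000.0,         # placeholder clipping bounds
                    num_denoising_steps=5, guidance_scale=1.0)
# depths: torch.Tensor of shape [t, 1, h, w], inverted and clipped to [near, far]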
extern/depthcrafter/unet.py ADDED
@@ -0,0 +1,142 @@
1
+ from typing import Union, Tuple
2
+
3
+ import torch
4
+ from diffusers import UNetSpatioTemporalConditionModel
5
+ from diffusers.models.unets.unet_spatio_temporal_condition import UNetSpatioTemporalConditionOutput
6
+
7
+
8
+ class DiffusersUNetSpatioTemporalConditionModelDepthCrafter(
9
+ UNetSpatioTemporalConditionModel
10
+ ):
11
+
12
+ def forward(
13
+ self,
14
+ sample: torch.Tensor,
15
+ timestep: Union[torch.Tensor, float, int],
16
+ encoder_hidden_states: torch.Tensor,
17
+ added_time_ids: torch.Tensor,
18
+ return_dict: bool = True,
19
+ ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
20
+
21
+ # 1. time
22
+ timesteps = timestep
23
+ if not torch.is_tensor(timesteps):
24
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
25
+ # This would be a good case for the `match` statement (Python 3.10+)
26
+ is_mps = sample.device.type == "mps"
27
+ if isinstance(timestep, float):
28
+ dtype = torch.float32 if is_mps else torch.float64
29
+ else:
30
+ dtype = torch.int32 if is_mps else torch.int64
31
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
32
+ elif len(timesteps.shape) == 0:
33
+ timesteps = timesteps[None].to(sample.device)
34
+
35
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
36
+ batch_size, num_frames = sample.shape[:2]
37
+ timesteps = timesteps.expand(batch_size)
38
+
39
+ t_emb = self.time_proj(timesteps)
40
+
41
+ # `Timesteps` does not contain any weights and will always return f32 tensors
42
+ # but time_embedding might actually be running in fp16. so we need to cast here.
43
+ # there might be better ways to encapsulate this.
44
+ t_emb = t_emb.to(dtype=self.conv_in.weight.dtype)
45
+
46
+ emb = self.time_embedding(t_emb) # [batch_size * num_frames, channels]
47
+
48
+ time_embeds = self.add_time_proj(added_time_ids.flatten())
49
+ time_embeds = time_embeds.reshape((batch_size, -1))
50
+ time_embeds = time_embeds.to(emb.dtype)
51
+ aug_emb = self.add_embedding(time_embeds)
52
+ emb = emb + aug_emb
53
+
54
+ # Flatten the batch and frames dimensions
55
+ # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
56
+ sample = sample.flatten(0, 1)
57
+ # Repeat the embeddings num_video_frames times
58
+ # emb: [batch, channels] -> [batch * frames, channels]
59
+ emb = emb.repeat_interleave(num_frames, dim=0)
60
+ # encoder_hidden_states: [batch, frames, channels] -> [batch * frames, 1, channels]
61
+ encoder_hidden_states = encoder_hidden_states.flatten(0, 1).unsqueeze(1)
62
+
63
+ # 2. pre-process
64
+ sample = sample.to(dtype=self.conv_in.weight.dtype)
65
+ assert sample.dtype == self.conv_in.weight.dtype, (
66
+ f"sample.dtype: {sample.dtype}, "
67
+ f"self.conv_in.weight.dtype: {self.conv_in.weight.dtype}"
68
+ )
69
+ sample = self.conv_in(sample)
70
+
71
+ image_only_indicator = torch.zeros(
72
+ batch_size, num_frames, dtype=sample.dtype, device=sample.device
73
+ )
74
+
75
+ down_block_res_samples = (sample,)
76
+ for downsample_block in self.down_blocks:
77
+ if (
78
+ hasattr(downsample_block, "has_cross_attention")
79
+ and downsample_block.has_cross_attention
80
+ ):
81
+ sample, res_samples = downsample_block(
82
+ hidden_states=sample,
83
+ temb=emb,
84
+ encoder_hidden_states=encoder_hidden_states,
85
+ image_only_indicator=image_only_indicator,
86
+ )
87
+
88
+ else:
89
+ sample, res_samples = downsample_block(
90
+ hidden_states=sample,
91
+ temb=emb,
92
+ image_only_indicator=image_only_indicator,
93
+ )
94
+
95
+ down_block_res_samples += res_samples
96
+
97
+ # 4. mid
98
+ sample = self.mid_block(
99
+ hidden_states=sample,
100
+ temb=emb,
101
+ encoder_hidden_states=encoder_hidden_states,
102
+ image_only_indicator=image_only_indicator,
103
+ )
104
+
105
+ # 5. up
106
+ for i, upsample_block in enumerate(self.up_blocks):
107
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
108
+ down_block_res_samples = down_block_res_samples[
109
+ : -len(upsample_block.resnets)
110
+ ]
111
+
112
+ if (
113
+ hasattr(upsample_block, "has_cross_attention")
114
+ and upsample_block.has_cross_attention
115
+ ):
116
+ sample = upsample_block(
117
+ hidden_states=sample,
118
+ res_hidden_states_tuple=res_samples,
119
+ temb=emb,
120
+ encoder_hidden_states=encoder_hidden_states,
121
+ image_only_indicator=image_only_indicator,
122
+ )
123
+ else:
124
+ sample = upsample_block(
125
+ hidden_states=sample,
126
+ res_hidden_states_tuple=res_samples,
127
+ temb=emb,
128
+ image_only_indicator=image_only_indicator,
129
+ )
130
+
131
+ # 6. post-process
132
+ sample = self.conv_norm_out(sample)
133
+ sample = self.conv_act(sample)
134
+ sample = self.conv_out(sample)
135
+
136
+ # 7. Reshape back to original shape
137
+ sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
138
+
139
+ if not return_dict:
140
+ return (sample,)
141
+
142
+ return UNetSpatioTemporalConditionOutput(sample=sample)
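A shape sketch of the input this forward() receives from the pipeline above: the noisy latents are concatenated with the VAE latents of the conditioning video along the channel axis, so the UNet's in_channels must be twice the latent channel count. Sizes are illustrative:

import torch

latents       = torch.randn(1, 110, 4, 72, 128)   # [b, t, c, h, w] noisy latents
video_latents = torch.randn(1, 110, 4, 72, 128)   # VAE latents of the conditioning video
unet_input    = torch.cat([latents, video_latents], dim=2)   # [1, 110, 8, 72, 128]
# forward() then flattens batch and time to [b*t, 8, 72, 128] before conv_in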
extern/video_depth_anything/__pycache__/dinov2.cpython-310.pyc ADDED
Binary file (12.2 kB)
extern/video_depth_anything/__pycache__/dpt.cpython-310.pyc ADDED
Binary file (3.64 kB)
extern/video_depth_anything/__pycache__/dpt_temporal.cpython-310.pyc ADDED
Binary file (2.76 kB)
extern/video_depth_anything/__pycache__/vdademo.cpython-310.pyc ADDED
Binary file (1.52 kB)
extern/video_depth_anything/__pycache__/video_depth.cpython-310.pyc ADDED
Binary file (4.66 kB)
extern/video_depth_anything/dinov2.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ # we add a small number to avoid floating point error in the interpolation
192
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
193
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
194
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
195
+ # w0, h0 = w0 + 0.1, h0 + 0.1
196
+
197
+ sqrt_N = math.sqrt(N)
198
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
199
+ patch_pos_embed = nn.functional.interpolate(
200
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
201
+ scale_factor=(sx, sy),
202
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
203
+ mode="bicubic",
204
+ antialias=self.interpolate_antialias
205
+ )
206
+
207
+ assert int(w0) == patch_pos_embed.shape[-2]
208
+ assert int(h0) == patch_pos_embed.shape[-1]
209
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
210
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
211
+
212
+ def prepare_tokens_with_masks(self, x, masks=None):
213
+ B, nc, w, h = x.shape
214
+ x = self.patch_embed(x)
215
+ if masks is not None:
216
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
217
+
218
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
219
+ x = x + self.interpolate_pos_encoding(x, w, h)
220
+
221
+ if self.register_tokens is not None:
222
+ x = torch.cat(
223
+ (
224
+ x[:, :1],
225
+ self.register_tokens.expand(x.shape[0], -1, -1),
226
+ x[:, 1:],
227
+ ),
228
+ dim=1,
229
+ )
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
235
+ for blk in self.blocks:
236
+ x = blk(x)
237
+
238
+ all_x = x
239
+ output = []
240
+ for x, masks in zip(all_x, masks_list):
241
+ x_norm = self.norm(x)
242
+ output.append(
243
+ {
244
+ "x_norm_clstoken": x_norm[:, 0],
245
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
246
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
247
+ "x_prenorm": x,
248
+ "masks": masks,
249
+ }
250
+ )
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
266
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
267
+ "x_prenorm": x,
268
+ "masks": masks,
269
+ }
270
+
271
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
272
+ x = self.prepare_tokens_with_masks(x)
273
+ # If n is an int, take the n last blocks. If it's a list, take them
274
+ output, total_block_len = [], len(self.blocks)
275
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ for i, blk in enumerate(self.blocks):
277
+ x = blk(x)
278
+ if i in blocks_to_take:
279
+ output.append(x)
280
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
281
+ return output
282
+
283
+ def _get_intermediate_layers_chunked(self, x, n=1):
284
+ x = self.prepare_tokens_with_masks(x)
285
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
288
+ for block_chunk in self.blocks:
289
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ i += 1
294
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
295
+ return output
296
+
297
+ def get_intermediate_layers(
298
+ self,
299
+ x: torch.Tensor,
300
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
301
+ reshape: bool = False,
302
+ return_class_token: bool = False,
303
+ norm=True
304
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
305
+ if self.chunked_blocks:
306
+ outputs = self._get_intermediate_layers_chunked(x, n)
307
+ else:
308
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
309
+ if norm:
310
+ outputs = [self.norm(out) for out in outputs]
311
+ class_tokens = [out[:, 0] for out in outputs]
312
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
313
+ if reshape:
314
+ B, _, w, h = x.shape
315
+ outputs = [
316
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
317
+ for out in outputs
318
+ ]
319
+ if return_class_token:
320
+ return tuple(zip(outputs, class_tokens))
321
+ return tuple(outputs)
322
+
323
+ def forward(self, *args, is_training=False, **kwargs):
324
+ ret = self.forward_features(*args, **kwargs)
325
+ if is_training:
326
+ return ret
327
+ else:
328
+ return self.head(ret["x_norm_clstoken"])
329
+
330
+
331
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
332
+ """ViT weight initialization, original timm impl (for reproducibility)"""
333
+ if isinstance(module, nn.Linear):
334
+ trunc_normal_(module.weight, std=0.02)
335
+ if module.bias is not None:
336
+ nn.init.zeros_(module.bias)
337
+
338
+
339
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
340
+ model = DinoVisionTransformer(
341
+ patch_size=patch_size,
342
+ embed_dim=384,
343
+ depth=12,
344
+ num_heads=6,
345
+ mlp_ratio=4,
346
+ block_fn=partial(Block, attn_class=MemEffAttention),
347
+ num_register_tokens=num_register_tokens,
348
+ **kwargs,
349
+ )
350
+ return model
351
+
352
+
353
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ patch_size=patch_size,
356
+ embed_dim=768,
357
+ depth=12,
358
+ num_heads=12,
359
+ mlp_ratio=4,
360
+ block_fn=partial(Block, attn_class=MemEffAttention),
361
+ num_register_tokens=num_register_tokens,
362
+ **kwargs,
363
+ )
364
+ return model
365
+
366
+
367
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
368
+ model = DinoVisionTransformer(
369
+ patch_size=patch_size,
370
+ embed_dim=1024,
371
+ depth=24,
372
+ num_heads=16,
373
+ mlp_ratio=4,
374
+ block_fn=partial(Block, attn_class=MemEffAttention),
375
+ num_register_tokens=num_register_tokens,
376
+ **kwargs,
377
+ )
378
+ return model
379
+
380
+
381
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
382
+ """
383
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
384
+ """
385
+ model = DinoVisionTransformer(
386
+ patch_size=patch_size,
387
+ embed_dim=1536,
388
+ depth=40,
389
+ num_heads=24,
390
+ mlp_ratio=4,
391
+ block_fn=partial(Block, attn_class=MemEffAttention),
392
+ num_register_tokens=num_register_tokens,
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def DINOv2(model_name):
399
+ model_zoo = {
400
+ "vits": vit_small,
401
+ "vitb": vit_base,
402
+ "vitl": vit_large,
403
+ "vitg": vit_giant2
404
+ }
405
+
406
+ return model_zoo[model_name](
407
+ img_size=518,
408
+ patch_size=14,
409
+ init_values=1.0,
410
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
411
+ block_chunks=0,
412
+ num_register_tokens=0,
413
+ interpolate_antialias=False,
414
+ interpolate_offset=0.1
415
+ )
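A minimal sketch of how this factory is typically driven for dense features; the input resolution and the intermediate-layer indices below are illustrative assumptions:

import torch

encoder = DINOv2("vitl")
x = torch.randn(1, 3, 518, 518)    # H and W must be multiples of patch_size=14
feats = encoder.get_intermediate_layers(x, n=[4, 11, 17, 23], return_class_token=True)
tokens, cls = feats[-1]            # tokens: [1, (518 // 14) ** 2, 1024] = [1, 1369, 1024]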
extern/video_depth_anything/dinov2_layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
extern/video_depth_anything/dinov2_layers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (415 Bytes)
extern/video_depth_anything/dinov2_layers/__pycache__/attention.cpython-310.pyc ADDED
Binary file (2.38 kB)
extern/video_depth_anything/dinov2_layers/__pycache__/block.cpython-310.pyc ADDED
Binary file (7.99 kB)
extern/video_depth_anything/dinov2_layers/__pycache__/drop_path.cpython-310.pyc ADDED
Binary file (1.22 kB)
extern/video_depth_anything/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc ADDED
Binary file (1.02 kB)
extern/video_depth_anything/dinov2_layers/__pycache__/mlp.cpython-310.pyc ADDED
Binary file (1.21 kB)
extern/video_depth_anything/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc ADDED
Binary file (2.66 kB)
extern/video_depth_anything/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc ADDED
Binary file (2.01 kB)
extern/video_depth_anything/dinov2_layers/attention.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ try:
21
+ from xformers.ops import memory_efficient_attention, unbind, fmha
22
+
23
+ XFORMERS_AVAILABLE = True
24
+ except ImportError:
25
+ logger.warning("xFormers not available")
26
+ XFORMERS_AVAILABLE = False
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ num_heads: int = 8,
34
+ qkv_bias: bool = False,
35
+ proj_bias: bool = True,
36
+ attn_drop: float = 0.0,
37
+ proj_drop: float = 0.0,
38
+ ) -> None:
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+
44
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45
+ self.attn_drop = nn.Dropout(attn_drop)
46
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
47
+ self.proj_drop = nn.Dropout(proj_drop)
48
+
49
+ def forward(self, x: Tensor) -> Tensor:
50
+ B, N, C = x.shape
51
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52
+
53
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54
+ attn = q @ k.transpose(-2, -1)
55
+
56
+ attn = attn.softmax(dim=-1)
57
+ attn = self.attn_drop(attn)
58
+
59
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60
+ x = self.proj(x)
61
+ x = self.proj_drop(x)
62
+ return x
63
+
64
+
65
+ class MemEffAttention(Attention):
66
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67
+ if not XFORMERS_AVAILABLE:
68
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
69
+ return super().forward(x)
70
+
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73
+
74
+ q, k, v = unbind(qkv, 2)
75
+
76
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77
+ x = x.reshape([B, N, C])
78
+
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+
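A small sketch: MemEffAttention transparently falls back to the plain Attention path when xFormers is not installed (in which case attn_bias must stay None). Sizes are illustrative:

import torch

attn = MemEffAttention(dim=384, num_heads=6, qkv_bias=True)
y = attn(torch.randn(2, 197, 384))   # [batch, tokens, dim] -> same shape out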
extern/video_depth_anything/dinov2_layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
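Sketch of the two call forms: a plain Tensor goes through the standard pre-norm ViT block, while a list of tensors takes the nested path, which additionally requires xFormers and MemEffAttention. Sizes are illustrative:

import torch

block = NestedTensorBlock(dim=384, num_heads=6, mlp_ratio=4.0,
                          qkv_bias=True, attn_class=MemEffAttention)
y = block(torch.randn(2, 197, 384))   # Tensor in, same-shaped Tensor out
# block([x1, x2]) runs the nested-tensor branch and needs xFormers installed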
extern/video_depth_anything/dinov2_layers/drop_path.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+
12
+ from torch import nn
13
+
14
+
15
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16
+ if drop_prob == 0.0 or not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21
+ if keep_prob > 0.0:
22
+ random_tensor.div_(keep_prob)
23
+ output = x * random_tensor
24
+ return output
25
+
26
+
27
+ class DropPath(nn.Module):
28
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29
+
30
+ def __init__(self, drop_prob=None):
31
+ super(DropPath, self).__init__()
32
+ self.drop_prob = drop_prob
33
+
34
+ def forward(self, x):
35
+ return drop_path(x, self.drop_prob, self.training)
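Sketch of the behaviour above: during training each sample is zeroed with probability drop_prob and survivors are rescaled by 1/keep_prob, so the expected output matches the input; at inference time it is the identity:

import torch

dp = DropPath(drop_prob=0.1)
dp.train()
y = dp(torch.ones(4, 197, 384))   # each sample is all zeros or all 1/0.9 ≈ 1.111
dp.eval()
x = torch.randn(4, 197, 384)
assert torch.equal(dp(x), x)      # identity when not training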
extern/video_depth_anything/dinov2_layers/layer_scale.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
8
+
9
+ from typing import Union
10
+
11
+ import torch
12
+ from torch import Tensor
13
+ from torch import nn
14
+
15
+
16
+ class LayerScale(nn.Module):
17
+ def __init__(
18
+ self,
19
+ dim: int,
20
+ init_values: Union[float, Tensor] = 1e-5,
21
+ inplace: bool = False,
22
+ ) -> None:
23
+ super().__init__()
24
+ self.inplace = inplace
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
extern/video_depth_anything/dinov2_layers/mlp.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+
12
+ from typing import Callable, Optional
13
+
14
+ from torch import Tensor, nn
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(
19
+ self,
20
+ in_features: int,
21
+ hidden_features: Optional[int] = None,
22
+ out_features: Optional[int] = None,
23
+ act_layer: Callable[..., nn.Module] = nn.GELU,
24
+ drop: float = 0.0,
25
+ bias: bool = True,
26
+ ) -> None:
27
+ super().__init__()
28
+ out_features = out_features or in_features
29
+ hidden_features = hidden_features or in_features
30
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
31
+ self.act = act_layer()
32
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
33
+ self.drop = nn.Dropout(drop)
34
+
35
+ def forward(self, x: Tensor) -> Tensor:
36
+ x = self.fc1(x)
37
+ x = self.act(x)
38
+ x = self.drop(x)
39
+ x = self.fc2(x)
40
+ x = self.drop(x)
41
+ return x
extern/video_depth_anything/dinov2_layers/patch_embed.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ from typing import Callable, Optional, Tuple, Union
12
+
13
+ from torch import Tensor
14
+ import torch.nn as nn
15
+
16
+
17
+ def make_2tuple(x):
18
+ if isinstance(x, tuple):
19
+ assert len(x) == 2
20
+ return x
21
+
22
+ assert isinstance(x, int)
23
+ return (x, x)
24
+
25
+
26
+ class PatchEmbed(nn.Module):
27
+ """
28
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
29
+
30
+ Args:
31
+ img_size: Image size.
32
+ patch_size: Patch token size.
33
+ in_chans: Number of input image channels.
34
+ embed_dim: Number of linear projection output channels.
35
+ norm_layer: Normalization layer.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ img_size: Union[int, Tuple[int, int]] = 224,
41
+ patch_size: Union[int, Tuple[int, int]] = 16,
42
+ in_chans: int = 3,
43
+ embed_dim: int = 768,
44
+ norm_layer: Optional[Callable] = None,
45
+ flatten_embedding: bool = True,
46
+ ) -> None:
47
+ super().__init__()
48
+
49
+ image_HW = make_2tuple(img_size)
50
+ patch_HW = make_2tuple(patch_size)
51
+ patch_grid_size = (
52
+ image_HW[0] // patch_HW[0],
53
+ image_HW[1] // patch_HW[1],
54
+ )
55
+
56
+ self.img_size = image_HW
57
+ self.patch_size = patch_HW
58
+ self.patches_resolution = patch_grid_size
59
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
60
+
61
+ self.in_chans = in_chans
62
+ self.embed_dim = embed_dim
63
+
64
+ self.flatten_embedding = flatten_embedding
65
+
66
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
67
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
68
+
69
+ def forward(self, x: Tensor) -> Tensor:
70
+ _, _, H, W = x.shape
71
+ patch_H, patch_W = self.patch_size
72
+
73
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
74
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width {patch_W}"
75
+
76
+ x = self.proj(x) # B C H W
77
+ H, W = x.size(2), x.size(3)
78
+ x = x.flatten(2).transpose(1, 2) # B HW C
79
+ x = self.norm(x)
80
+ if not self.flatten_embedding:
81
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
82
+ return x
83
+
84
+ def flops(self) -> float:
85
+ Ho, Wo = self.patches_resolution
86
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
87
+ if self.norm is not None:
88
+ flops += Ho * Wo * self.embed_dim
89
+ return flops
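A quick shape check under the DINOv2 defaults used elsewhere in this directory (518x518 input, 14x14 patches); the sizes are illustrative:

import torch

pe = PatchEmbed(img_size=518, patch_size=14, in_chans=3, embed_dim=1024)
tokens = pe(torch.randn(1, 3, 518, 518))   # [1, 37 * 37, 1024] = [1, 1369, 1024]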
extern/video_depth_anything/dinov2_layers/swiglu_ffn.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Callable, Optional
8
+
9
+ from torch import Tensor, nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ class SwiGLUFFN(nn.Module):
14
+ def __init__(
15
+ self,
16
+ in_features: int,
17
+ hidden_features: Optional[int] = None,
18
+ out_features: Optional[int] = None,
19
+ act_layer: Callable[..., nn.Module] = None,
20
+ drop: float = 0.0,
21
+ bias: bool = True,
22
+ ) -> None:
23
+ super().__init__()
24
+ out_features = out_features or in_features
25
+ hidden_features = hidden_features or in_features
26
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
27
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
28
+
29
+ def forward(self, x: Tensor) -> Tensor:
30
+ x12 = self.w12(x)
31
+ x1, x2 = x12.chunk(2, dim=-1)
32
+ hidden = F.silu(x1) * x2
33
+ return self.w3(hidden)
34
+
35
+
36
+ try:
37
+ from xformers.ops import SwiGLU
38
+
39
+ XFORMERS_AVAILABLE = True
40
+ except ImportError:
41
+ SwiGLU = SwiGLUFFN
42
+ XFORMERS_AVAILABLE = False
43
+
44
+
45
+ class SwiGLUFFNFused(SwiGLU):
46
+ def __init__(
47
+ self,
48
+ in_features: int,
49
+ hidden_features: Optional[int] = None,
50
+ out_features: Optional[int] = None,
51
+ act_layer: Callable[..., nn.Module] = None,
52
+ drop: float = 0.0,
53
+ bias: bool = True,
54
+ ) -> None:
55
+ out_features = out_features or in_features
56
+ hidden_features = hidden_features or in_features
57
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
58
+ super().__init__(
59
+ in_features=in_features,
60
+ hidden_features=hidden_features,
61
+ out_features=out_features,
62
+ bias=bias,
63
+ )
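A worked example of the hidden-width rule above, assuming in_features=1536 with the usual 4x expansion: 4 * 1536 = 6144, shrunk by 2/3 to 4096, then rounded up to the next multiple of 8 (already 4096 here, since (int(6144 * 2 / 3) + 7) // 8 * 8 == 4096):

import torch

ffn = SwiGLUFFNFused(in_features=1536, hidden_features=6144)   # internal width becomes 4096
y = ffn(torch.randn(2, 1369, 1536))                            # -> [2, 1369, 1536]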
extern/video_depth_anything/dpt.py ADDED
@@ -0,0 +1,160 @@
1
+ # Copyright (2025) Bytedance Ltd. and/or its affiliates
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+
18
+ from .util.blocks import FeatureFusionBlock, _make_scratch
19
+
20
+
21
+ def _make_fusion_block(features, use_bn, size=None):
22
+ return FeatureFusionBlock(
23
+ features,
24
+ nn.ReLU(False),
25
+ deconv=False,
26
+ bn=use_bn,
27
+ expand=False,
28
+ align_corners=True,
29
+ size=size,
30
+ )
31
+
32
+
33
+ class ConvBlock(nn.Module):
34
+ def __init__(self, in_feature, out_feature):
35
+ super().__init__()
36
+
37
+ self.conv_block = nn.Sequential(
38
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
39
+ nn.BatchNorm2d(out_feature),
40
+ nn.ReLU(True)
41
+ )
42
+
43
+ def forward(self, x):
44
+ return self.conv_block(x)
45
+
46
+
47
+ class DPTHead(nn.Module):
48
+ def __init__(
49
+ self,
50
+ in_channels,
51
+ features=256,
52
+ use_bn=False,
53
+ out_channels=[256, 512, 1024, 1024],
54
+ use_clstoken=False
55
+ ):
56
+ super(DPTHead, self).__init__()
57
+
58
+ self.use_clstoken = use_clstoken
59
+
60
+ self.projects = nn.ModuleList([
61
+ nn.Conv2d(
62
+ in_channels=in_channels,
63
+ out_channels=out_channel,
64
+ kernel_size=1,
65
+ stride=1,
66
+ padding=0,
67
+ ) for out_channel in out_channels
68
+ ])
69
+
70
+ self.resize_layers = nn.ModuleList([
71
+ nn.ConvTranspose2d(
72
+ in_channels=out_channels[0],
73
+ out_channels=out_channels[0],
74
+ kernel_size=4,
75
+ stride=4,
76
+ padding=0),
77
+ nn.ConvTranspose2d(
78
+ in_channels=out_channels[1],
79
+ out_channels=out_channels[1],
80
+ kernel_size=2,
81
+ stride=2,
82
+ padding=0),
83
+ nn.Identity(),
84
+ nn.Conv2d(
85
+ in_channels=out_channels[3],
86
+ out_channels=out_channels[3],
87
+ kernel_size=3,
88
+ stride=2,
89
+ padding=1)
90
+ ])
91
+
92
+ if use_clstoken:
93
+ self.readout_projects = nn.ModuleList()
94
+ for _ in range(len(self.projects)):
95
+ self.readout_projects.append(
96
+ nn.Sequential(
97
+ nn.Linear(2 * in_channels, in_channels),
98
+ nn.GELU()))
99
+
100
+ self.scratch = _make_scratch(
101
+ out_channels,
102
+ features,
103
+ groups=1,
104
+ expand=False,
105
+ )
106
+
107
+ self.scratch.stem_transpose = None
108
+
109
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
110
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
111
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
112
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
113
+
114
+ head_features_1 = features
115
+ head_features_2 = 32
116
+
117
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
118
+ self.scratch.output_conv2 = nn.Sequential(
119
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
120
+ nn.ReLU(True),
121
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
122
+ nn.ReLU(True),
123
+ nn.Identity(),
124
+ )
125
+
126
+ def forward(self, out_features, patch_h, patch_w):
127
+ out = []
128
+ for i, x in enumerate(out_features):
129
+ if self.use_clstoken:
130
+ x, cls_token = x[0], x[1]
131
+ readout = cls_token.unsqueeze(1).expand_as(x)
132
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
133
+ else:
134
+ x = x[0]
135
+
136
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
137
+
138
+ x = self.projects[i](x)
139
+ x = self.resize_layers[i](x)
140
+
141
+ out.append(x)
142
+
143
+ layer_1, layer_2, layer_3, layer_4 = out
144
+
145
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
146
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
147
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
148
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
149
+
150
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
151
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
152
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
153
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
154
+
155
+ out = self.scratch.output_conv1(path_1)
156
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
157
+ out = self.scratch.output_conv2(out)
158
+
159
+ return out
160
+
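A shape sketch for DPTHead (editorial, not part of the commit). Each element of out_features is expected to be a (patch_tokens, class_token) pair, as returned by DINOv2's get_intermediate_layers(..., return_class_token=True); the shapes below are illustrative:

import torch
from extern.video_depth_anything.dpt import DPTHead

B, patch_h, patch_w, C = 1, 14, 14, 1024
feats = [(torch.randn(B, patch_h * patch_w, C), torch.randn(B, C)) for _ in range(4)]

head = DPTHead(in_channels=C)            # defaults: features=256, out_channels=[256, 512, 1024, 1024]
depth = head(feats, patch_h, patch_w)    # -> (B, 1, patch_h * 14, patch_w * 14)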
extern/video_depth_anything/dpt_temporal.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (2025) Bytedance Ltd. and/or its affiliates
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn.functional as F
16
+ import torch.nn as nn
17
+ from .dpt import DPTHead
18
+ from .motion_module.motion_module import TemporalModule
19
+ from easydict import EasyDict
20
+
21
+
22
+ class DPTHeadTemporal(DPTHead):
23
+ def __init__(self,
24
+ in_channels,
25
+ features=256,
26
+ use_bn=False,
27
+ out_channels=[256, 512, 1024, 1024],
28
+ use_clstoken=False,
29
+ num_frames=32,
30
+ pe='ape'
31
+ ):
32
+ super().__init__(in_channels, features, use_bn, out_channels, use_clstoken)
33
+
34
+ assert num_frames > 0
35
+ motion_module_kwargs = EasyDict(num_attention_heads = 8,
36
+ num_transformer_block = 1,
37
+ num_attention_blocks = 2,
38
+ temporal_max_len = num_frames,
39
+ zero_initialize = True,
40
+ pos_embedding_type = pe)
41
+
42
+ self.motion_modules = nn.ModuleList([
43
+ TemporalModule(in_channels=out_channels[2],
44
+ **motion_module_kwargs),
45
+ TemporalModule(in_channels=out_channels[3],
46
+ **motion_module_kwargs),
47
+ TemporalModule(in_channels=features,
48
+ **motion_module_kwargs),
49
+ TemporalModule(in_channels=features,
50
+ **motion_module_kwargs)
51
+ ])
52
+
53
+ def forward(self, out_features, patch_h, patch_w, frame_length):
54
+ out = []
55
+ for i, x in enumerate(out_features):
56
+ if self.use_clstoken:
57
+ x, cls_token = x[0], x[1]
58
+ readout = cls_token.unsqueeze(1).expand_as(x)
59
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
60
+ else:
61
+ x = x[0]
62
+
63
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w)).contiguous()
64
+
65
+ B, T = x.shape[0] // frame_length, frame_length
66
+ x = self.projects[i](x)
67
+ x = self.resize_layers[i](x)
68
+
69
+ out.append(x)
70
+
71
+ layer_1, layer_2, layer_3, layer_4 = out
72
+
73
+ B, T = layer_1.shape[0] // frame_length, frame_length
74
+
75
+ layer_3 = self.motion_modules[0](layer_3.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
76
+ layer_4 = self.motion_modules[1](layer_4.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
77
+
78
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
79
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
80
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
81
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
82
+
83
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
84
+ path_4 = self.motion_modules[2](path_4.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
85
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
86
+ path_3 = self.motion_modules[3](path_3.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
87
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
88
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
89
+
90
+ out = self.scratch.output_conv1(path_1)
91
+ out = F.interpolate(
92
+ out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True
93
+ )
94
+ out = self.scratch.output_conv2(out)
95
+
96
+ return out
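A shape sketch for the temporal head (editorial, not part of the commit). Frames are stacked along the batch axis of the token features and frame_length tells the motion modules how to unflatten them; note that with xformers installed the attention path expects CUDA tensors, so move the module and inputs to a GPU in that case:

import torch
from extern.video_depth_anything.dpt_temporal import DPTHeadTemporal

B, T, C, patch_h, patch_w = 1, 8, 1024, 14, 14
feats = [(torch.randn(B * T, patch_h * patch_w, C), torch.randn(B * T, C)) for _ in range(4)]

head = DPTHeadTemporal(in_channels=C, num_frames=32, pe='ape')
depth = head(feats, patch_h, patch_w, frame_length=T)   # -> (B * T, 1, patch_h * 14, patch_w * 14)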
extern/video_depth_anything/motion_module/__pycache__/attention.cpython-310.pyc ADDED
Binary file (12.1 kB).
 
extern/video_depth_anything/motion_module/__pycache__/motion_module.cpython-310.pyc ADDED
Binary file (7.39 kB).
 
extern/video_depth_anything/motion_module/attention.py ADDED
@@ -0,0 +1,429 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Optional, Tuple
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from torch import nn
19
+
20
+ try:
21
+ import xformers
22
+ import xformers.ops
23
+
24
+ XFORMERS_AVAILABLE = True
25
+ except ImportError:
26
+ print("xFormers not available")
27
+ XFORMERS_AVAILABLE = False
28
+
29
+
30
+ class CrossAttention(nn.Module):
31
+ r"""
32
+ A cross attention layer.
33
+
34
+ Parameters:
35
+ query_dim (`int`): The number of channels in the query.
36
+ cross_attention_dim (`int`, *optional*):
37
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
38
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
39
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
40
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
41
+ bias (`bool`, *optional*, defaults to False):
42
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
43
+ """
44
+
45
+ def __init__(
46
+ self,
47
+ query_dim: int,
48
+ cross_attention_dim: Optional[int] = None,
49
+ heads: int = 8,
50
+ dim_head: int = 64,
51
+ dropout: float = 0.0,
52
+ bias=False,
53
+ upcast_attention: bool = False,
54
+ upcast_softmax: bool = False,
55
+ added_kv_proj_dim: Optional[int] = None,
56
+ norm_num_groups: Optional[int] = None,
57
+ ):
58
+ super().__init__()
59
+ inner_dim = dim_head * heads
60
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
61
+ self.upcast_attention = upcast_attention
62
+ self.upcast_softmax = upcast_softmax
63
+ self.upcast_efficient_attention = False
64
+
65
+ self.scale = dim_head**-0.5
66
+
67
+ self.heads = heads
68
+ # for slice_size > 0 the attention score computation
69
+ # is split across the batch axis to save memory
70
+ # You can set slice_size with `set_attention_slice`
71
+ self.sliceable_head_dim = heads
72
+ self._slice_size = None
73
+ self._use_memory_efficient_attention_xformers = False
74
+ self.added_kv_proj_dim = added_kv_proj_dim
75
+
76
+ if norm_num_groups is not None:
77
+ self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
78
+ else:
79
+ self.group_norm = None
80
+
81
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
82
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
83
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
84
+
85
+ if self.added_kv_proj_dim is not None:
86
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
87
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
88
+
89
+ self.to_out = nn.ModuleList([])
90
+ self.to_out.append(nn.Linear(inner_dim, query_dim))
91
+ self.to_out.append(nn.Dropout(dropout))
92
+
93
+ def reshape_heads_to_batch_dim(self, tensor):
94
+ batch_size, seq_len, dim = tensor.shape
95
+ head_size = self.heads
96
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size).contiguous()
97
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size).contiguous()
98
+ return tensor
99
+
100
+ def reshape_heads_to_4d(self, tensor):
101
+ batch_size, seq_len, dim = tensor.shape
102
+ head_size = self.heads
103
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size).contiguous()
104
+ return tensor
105
+
106
+ def reshape_batch_dim_to_heads(self, tensor):
107
+ batch_size, seq_len, dim = tensor.shape
108
+ head_size = self.heads
109
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim).contiguous()
110
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size).contiguous()
111
+ return tensor
112
+
113
+ def reshape_4d_to_heads(self, tensor):
114
+ batch_size, seq_len, head_size, dim = tensor.shape
115
+ head_size = self.heads
116
+ tensor = tensor.reshape(batch_size, seq_len, dim * head_size).contiguous()
117
+ return tensor
118
+
119
+ def set_attention_slice(self, slice_size):
120
+ if slice_size is not None and slice_size > self.sliceable_head_dim:
121
+ raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
122
+
123
+ self._slice_size = slice_size
124
+
125
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
126
+ batch_size, sequence_length, _ = hidden_states.shape
127
+
128
+ encoder_hidden_states = encoder_hidden_states
129
+
130
+ if self.group_norm is not None:
131
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
132
+
133
+ query = self.to_q(hidden_states)
134
+ dim = query.shape[-1]
135
+ query = self.reshape_heads_to_batch_dim(query)
136
+
137
+ if self.added_kv_proj_dim is not None:
138
+ key = self.to_k(hidden_states)
139
+ value = self.to_v(hidden_states)
140
+ encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
141
+ encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)
142
+
143
+ key = self.reshape_heads_to_batch_dim(key)
144
+ value = self.reshape_heads_to_batch_dim(value)
145
+ encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)
146
+ encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)
147
+
148
+ key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)
149
+ value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)
150
+ else:
151
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
152
+ key = self.to_k(encoder_hidden_states)
153
+ value = self.to_v(encoder_hidden_states)
154
+
155
+ key = self.reshape_heads_to_batch_dim(key)
156
+ value = self.reshape_heads_to_batch_dim(value)
157
+
158
+ if attention_mask is not None:
159
+ if attention_mask.shape[-1] != query.shape[1]:
160
+ target_length = query.shape[1]
161
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
162
+ attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
163
+
164
+ # attention, what we cannot get enough of
165
+ if XFORMERS_AVAILABLE and self._use_memory_efficient_attention_xformers:
166
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
167
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
168
+ hidden_states = hidden_states.to(query.dtype)
169
+ else:
170
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
171
+ hidden_states = self._attention(query, key, value, attention_mask)
172
+ else:
173
+ hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
174
+
175
+ # linear proj
176
+ hidden_states = self.to_out[0](hidden_states)
177
+
178
+ # dropout
179
+ hidden_states = self.to_out[1](hidden_states)
180
+ return hidden_states
181
+
182
+ def _attention(self, query, key, value, attention_mask=None):
183
+ if self.upcast_attention:
184
+ query = query.float()
185
+ key = key.float()
186
+
187
+ attention_scores = torch.baddbmm(
188
+ torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
189
+ query,
190
+ key.transpose(-1, -2),
191
+ beta=0,
192
+ alpha=self.scale,
193
+ )
194
+
195
+ if attention_mask is not None:
196
+ attention_scores = attention_scores + attention_mask
197
+
198
+ if self.upcast_softmax:
199
+ attention_scores = attention_scores.float()
200
+
201
+ attention_probs = attention_scores.softmax(dim=-1)
202
+
203
+ # cast back to the original dtype
204
+ attention_probs = attention_probs.to(value.dtype)
205
+
206
+ # compute attention output
207
+ hidden_states = torch.bmm(attention_probs, value)
208
+
209
+ # reshape hidden_states
210
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
211
+ return hidden_states
212
+
213
+ def _sliced_attention(self, query, key, value, sequence_length, dim, attention_mask):
214
+ batch_size_attention = query.shape[0]
215
+ hidden_states = torch.zeros(
216
+ (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
217
+ )
218
+ slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
219
+ for i in range(hidden_states.shape[0] // slice_size):
220
+ start_idx = i * slice_size
221
+ end_idx = (i + 1) * slice_size
222
+
223
+ query_slice = query[start_idx:end_idx]
224
+ key_slice = key[start_idx:end_idx]
225
+
226
+ if self.upcast_attention:
227
+ query_slice = query_slice.float()
228
+ key_slice = key_slice.float()
229
+
230
+ attn_slice = torch.baddbmm(
231
+ torch.empty(slice_size, query.shape[1], key.shape[1], dtype=query_slice.dtype, device=query.device),
232
+ query_slice,
233
+ key_slice.transpose(-1, -2),
234
+ beta=0,
235
+ alpha=self.scale,
236
+ )
237
+
238
+ if attention_mask is not None:
239
+ attn_slice = attn_slice + attention_mask[start_idx:end_idx]
240
+
241
+ if self.upcast_softmax:
242
+ attn_slice = attn_slice.float()
243
+
244
+ attn_slice = attn_slice.softmax(dim=-1)
245
+
246
+ # cast back to the original dtype
247
+ attn_slice = attn_slice.to(value.dtype)
248
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
249
+
250
+ hidden_states[start_idx:end_idx] = attn_slice
251
+
252
+ # reshape hidden_states
253
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
254
+ return hidden_states
255
+
256
+ def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
257
+ if self.upcast_efficient_attention:
258
+ org_dtype = query.dtype
259
+ query = query.float()
260
+ key = key.float()
261
+ value = value.float()
262
+ if attention_mask is not None:
263
+ attention_mask = attention_mask.float()
264
+ hidden_states = self._memory_efficient_attention_split(query, key, value, attention_mask)
265
+
266
+ if self.upcast_efficient_attention:
267
+ hidden_states = hidden_states.to(org_dtype)
268
+
269
+ hidden_states = self.reshape_4d_to_heads(hidden_states)
270
+ return hidden_states
271
+
272
+ # print("Error: no xformers")
273
+ # raise NotImplementedError
274
+
275
+ def _memory_efficient_attention_split(self, query, key, value, attention_mask):
276
+ batch_size = query.shape[0]
277
+ max_batch_size = 65535
278
+ num_batches = (batch_size + max_batch_size - 1) // max_batch_size
279
+ results = []
280
+ for i in range(num_batches):
281
+ start_idx = i * max_batch_size
282
+ end_idx = min((i + 1) * max_batch_size, batch_size)
283
+ query_batch = query[start_idx:end_idx]
284
+ key_batch = key[start_idx:end_idx]
285
+ value_batch = value[start_idx:end_idx]
286
+ if attention_mask is not None:
287
+ attention_mask_batch = attention_mask[start_idx:end_idx]
288
+ else:
289
+ attention_mask_batch = None
290
+ result = xformers.ops.memory_efficient_attention(query_batch, key_batch, value_batch, attn_bias=attention_mask_batch)
291
+ results.append(result)
292
+ full_result = torch.cat(results, dim=0)
293
+ return full_result
294
+
295
+
296
+ class FeedForward(nn.Module):
297
+ r"""
298
+ A feed-forward layer.
299
+
300
+ Parameters:
301
+ dim (`int`): The number of channels in the input.
302
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
303
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
304
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
305
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
306
+ """
307
+
308
+ def __init__(
309
+ self,
310
+ dim: int,
311
+ dim_out: Optional[int] = None,
312
+ mult: int = 4,
313
+ dropout: float = 0.0,
314
+ activation_fn: str = "geglu",
315
+ ):
316
+ super().__init__()
317
+ inner_dim = int(dim * mult)
318
+ dim_out = dim_out if dim_out is not None else dim
319
+
320
+ if activation_fn == "gelu":
321
+ act_fn = GELU(dim, inner_dim)
322
+ elif activation_fn == "geglu":
323
+ act_fn = GEGLU(dim, inner_dim)
324
+ elif activation_fn == "geglu-approximate":
325
+ act_fn = ApproximateGELU(dim, inner_dim)
+ else:
+ raise ValueError(f"Unsupported activation_fn: {activation_fn}")
326
+
327
+ self.net = nn.ModuleList([])
328
+ # project in
329
+ self.net.append(act_fn)
330
+ # project dropout
331
+ self.net.append(nn.Dropout(dropout))
332
+ # project out
333
+ self.net.append(nn.Linear(inner_dim, dim_out))
334
+
335
+ def forward(self, hidden_states):
336
+ for module in self.net:
337
+ hidden_states = module(hidden_states)
338
+ return hidden_states
339
+
340
+
341
+ class GELU(nn.Module):
342
+ r"""
343
+ GELU activation function
344
+ """
345
+
346
+ def __init__(self, dim_in: int, dim_out: int):
347
+ super().__init__()
348
+ self.proj = nn.Linear(dim_in, dim_out)
349
+
350
+ def gelu(self, gate):
351
+ if gate.device.type != "mps":
352
+ return F.gelu(gate)
353
+ # mps: gelu is not implemented for float16
354
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
355
+
356
+ def forward(self, hidden_states):
357
+ hidden_states = self.proj(hidden_states)
358
+ hidden_states = self.gelu(hidden_states)
359
+ return hidden_states
360
+
361
+
362
+ # feedforward
363
+ class GEGLU(nn.Module):
364
+ r"""
365
+ A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
366
+
367
+ Parameters:
368
+ dim_in (`int`): The number of channels in the input.
369
+ dim_out (`int`): The number of channels in the output.
370
+ """
371
+
372
+ def __init__(self, dim_in: int, dim_out: int):
373
+ super().__init__()
374
+ self.proj = nn.Linear(dim_in, dim_out * 2)
375
+
376
+ def gelu(self, gate):
377
+ if gate.device.type != "mps":
378
+ return F.gelu(gate)
379
+ # mps: gelu is not implemented for float16
380
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
381
+
382
+ def forward(self, hidden_states):
383
+ hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
384
+ return hidden_states * self.gelu(gate)
385
+
386
+
387
+ class ApproximateGELU(nn.Module):
388
+ """
389
+ The approximate form of Gaussian Error Linear Unit (GELU)
390
+
391
+ For more details, see section 2: https://arxiv.org/abs/1606.08415
392
+ """
393
+
394
+ def __init__(self, dim_in: int, dim_out: int):
395
+ super().__init__()
396
+ self.proj = nn.Linear(dim_in, dim_out)
397
+
398
+ def forward(self, x):
399
+ x = self.proj(x)
400
+ return x * torch.sigmoid(1.702 * x)
401
+
402
+
403
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
404
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
405
+ t = torch.arange(end, device=freqs.device, dtype=torch.float32)
406
+ freqs = torch.outer(t, freqs)
407
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
408
+ return freqs_cis
409
+
410
+
411
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
412
+ ndim = x.ndim
413
+ assert 0 <= 1 < ndim
414
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1])
415
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
416
+ return freqs_cis.view(*shape)
417
+
418
+
419
+ def apply_rotary_emb(
420
+ xq: torch.Tensor,
421
+ xk: torch.Tensor,
422
+ freqs_cis: torch.Tensor,
423
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
424
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2).contiguous())
425
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2).contiguous())
426
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
427
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(2)
428
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(2)
429
+ return xq_out.type_as(xq), xk_out.type_as(xk)
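A quick shape check for the rotary-embedding helpers above (editorial sketch). precompute_freqs_cis returns a complex (seq_len, dim // 2) table, and apply_rotary_emb rotates query/key pairs without changing their shapes:

import torch
from extern.video_depth_anything.motion_module.attention import precompute_freqs_cis, apply_rotary_emb

dim, seq_len = 64, 16
freqs_cis = precompute_freqs_cis(dim, seq_len)    # (16, 32), complex64
q = torch.randn(4, seq_len, dim)
k = torch.randn(4, seq_len, dim)
q_rot, k_rot = apply_rotary_emb(q, k, freqs_cis)  # same shapes as q and k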
extern/video_depth_anything/motion_module/motion_module.py ADDED
@@ -0,0 +1,297 @@
1
+ # This file is originally from AnimateDiff/animatediff/models/motion_module.py at main · guoyww/AnimateDiff
2
+ # SPDX-License-Identifier: Apache-2.0 license
3
+ #
4
+ # This file may have been modified by ByteDance Ltd. and/or its affiliates on [date of modification]
5
+ # Original file was released under [ Apache-2.0 license], with the full license text available at [https://github.com/guoyww/AnimateDiff?tab=Apache-2.0-1-ov-file#readme].
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch import nn
9
+
10
+ from .attention import CrossAttention, FeedForward, apply_rotary_emb, precompute_freqs_cis
11
+
12
+ from einops import rearrange, repeat
13
+ import math
14
+
15
+ try:
16
+ import xformers
17
+ import xformers.ops
18
+
19
+ XFORMERS_AVAILABLE = True
20
+ except ImportError:
21
+ print("xFormers not available")
22
+ XFORMERS_AVAILABLE = False
23
+
24
+
25
+ def zero_module(module):
26
+ # Zero out the parameters of a module and return it.
27
+ for p in module.parameters():
28
+ p.detach().zero_()
29
+ return module
30
+
31
+
32
+ class TemporalModule(nn.Module):
33
+ def __init__(
34
+ self,
35
+ in_channels,
36
+ num_attention_heads = 8,
37
+ num_transformer_block = 2,
38
+ num_attention_blocks = 2,
39
+ norm_num_groups = 32,
40
+ temporal_max_len = 32,
41
+ zero_initialize = True,
42
+ pos_embedding_type = "ape",
43
+ ):
44
+ super().__init__()
45
+
46
+ self.temporal_transformer = TemporalTransformer3DModel(
47
+ in_channels=in_channels,
48
+ num_attention_heads=num_attention_heads,
49
+ attention_head_dim=in_channels // num_attention_heads,
50
+ num_layers=num_transformer_block,
51
+ num_attention_blocks=num_attention_blocks,
52
+ norm_num_groups=norm_num_groups,
53
+ temporal_max_len=temporal_max_len,
54
+ pos_embedding_type=pos_embedding_type,
55
+ )
56
+
57
+ if zero_initialize:
58
+ self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
59
+
60
+ def forward(self, input_tensor, encoder_hidden_states, attention_mask=None):
61
+ hidden_states = input_tensor
62
+ hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
63
+
64
+ output = hidden_states
65
+ return output
66
+
67
+
68
+ class TemporalTransformer3DModel(nn.Module):
69
+ def __init__(
70
+ self,
71
+ in_channels,
72
+ num_attention_heads,
73
+ attention_head_dim,
74
+ num_layers,
75
+ num_attention_blocks = 2,
76
+ norm_num_groups = 32,
77
+ temporal_max_len = 32,
78
+ pos_embedding_type = "ape",
79
+ ):
80
+ super().__init__()
81
+
82
+ inner_dim = num_attention_heads * attention_head_dim
83
+
84
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
85
+ self.proj_in = nn.Linear(in_channels, inner_dim)
86
+
87
+ self.transformer_blocks = nn.ModuleList(
88
+ [
89
+ TemporalTransformerBlock(
90
+ dim=inner_dim,
91
+ num_attention_heads=num_attention_heads,
92
+ attention_head_dim=attention_head_dim,
93
+ num_attention_blocks=num_attention_blocks,
94
+ temporal_max_len=temporal_max_len,
95
+ pos_embedding_type=pos_embedding_type,
96
+ )
97
+ for d in range(num_layers)
98
+ ]
99
+ )
100
+ self.proj_out = nn.Linear(inner_dim, in_channels)
101
+
102
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
103
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
104
+ video_length = hidden_states.shape[2]
105
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
106
+
107
+ batch, channel, height, width = hidden_states.shape
108
+ residual = hidden_states
109
+
110
+ hidden_states = self.norm(hidden_states)
111
+ inner_dim = hidden_states.shape[1]
112
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim).contiguous()
113
+ hidden_states = self.proj_in(hidden_states)
114
+
115
+ # Transformer Blocks
116
+ for block in self.transformer_blocks:
117
+ hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length, attention_mask=attention_mask)
118
+
119
+ # output
120
+ hidden_states = self.proj_out(hidden_states)
121
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
122
+
123
+ output = hidden_states + residual
124
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
125
+
126
+ return output
127
+
128
+
129
+ class TemporalTransformerBlock(nn.Module):
130
+ def __init__(
131
+ self,
132
+ dim,
133
+ num_attention_heads,
134
+ attention_head_dim,
135
+ num_attention_blocks = 2,
136
+ temporal_max_len = 32,
137
+ pos_embedding_type = "ape",
138
+ ):
139
+ super().__init__()
140
+
141
+ self.attention_blocks = nn.ModuleList(
142
+ [
143
+ TemporalAttention(
144
+ query_dim=dim,
145
+ heads=num_attention_heads,
146
+ dim_head=attention_head_dim,
147
+ temporal_max_len=temporal_max_len,
148
+ pos_embedding_type=pos_embedding_type,
149
+ )
150
+ for i in range(num_attention_blocks)
151
+ ]
152
+ )
153
+ self.norms = nn.ModuleList(
154
+ [
155
+ nn.LayerNorm(dim)
156
+ for i in range(num_attention_blocks)
157
+ ]
158
+ )
159
+
160
+ self.ff = FeedForward(dim, dropout=0.0, activation_fn="geglu")
161
+ self.ff_norm = nn.LayerNorm(dim)
162
+
163
+
164
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
165
+ for attention_block, norm in zip(self.attention_blocks, self.norms):
166
+ norm_hidden_states = norm(hidden_states)
167
+ hidden_states = attention_block(
168
+ norm_hidden_states,
169
+ encoder_hidden_states=encoder_hidden_states,
170
+ video_length=video_length,
171
+ attention_mask=attention_mask,
172
+ ) + hidden_states
173
+
174
+ hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
175
+
176
+ output = hidden_states
177
+ return output
178
+
179
+
180
+ class PositionalEncoding(nn.Module):
181
+ def __init__(
182
+ self,
183
+ d_model,
184
+ dropout = 0.,
185
+ max_len = 32
186
+ ):
187
+ super().__init__()
188
+ self.dropout = nn.Dropout(p=dropout)
189
+ position = torch.arange(max_len).unsqueeze(1)
190
+ div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
191
+ pe = torch.zeros(1, max_len, d_model)
192
+ pe[0, :, 0::2] = torch.sin(position * div_term)
193
+ pe[0, :, 1::2] = torch.cos(position * div_term)
194
+ self.register_buffer('pe', pe)
195
+
196
+ def forward(self, x):
197
+ x = x + self.pe[:, :x.size(1)].to(x.dtype)
198
+ return self.dropout(x)
199
+
200
+ class TemporalAttention(CrossAttention):
201
+ def __init__(
202
+ self,
203
+ temporal_max_len = 32,
204
+ pos_embedding_type = "ape",
205
+ *args, **kwargs
206
+ ):
207
+ super().__init__(*args, **kwargs)
208
+
209
+ self.pos_embedding_type = pos_embedding_type
210
+ self._use_memory_efficient_attention_xformers = True
211
+
212
+ self.pos_encoder = None
213
+ self.freqs_cis = None
214
+ if self.pos_embedding_type == "ape":
215
+ self.pos_encoder = PositionalEncoding(
216
+ kwargs["query_dim"],
217
+ dropout=0.,
218
+ max_len=temporal_max_len
219
+ )
220
+
221
+ elif self.pos_embedding_type == "rope":
222
+ self.freqs_cis = precompute_freqs_cis(
223
+ kwargs["query_dim"],
224
+ temporal_max_len
225
+ )
226
+
227
+ else:
228
+ raise NotImplementedError
229
+
230
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
231
+ d = hidden_states.shape[1]
232
+ hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
233
+
234
+ if self.pos_encoder is not None:
235
+ hidden_states = self.pos_encoder(hidden_states)
236
+
237
+ encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) if encoder_hidden_states is not None else encoder_hidden_states
238
+
239
+ if self.group_norm is not None:
240
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
241
+
242
+ query = self.to_q(hidden_states)
243
+ dim = query.shape[-1]
244
+
245
+ if self.added_kv_proj_dim is not None:
246
+ raise NotImplementedError
247
+
248
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
249
+ key = self.to_k(encoder_hidden_states)
250
+ value = self.to_v(encoder_hidden_states)
251
+
252
+ if self.freqs_cis is not None:
253
+ seq_len = query.shape[1]
254
+ freqs_cis = self.freqs_cis[:seq_len].to(query.device)
255
+ query, key = apply_rotary_emb(query, key, freqs_cis)
256
+
257
+ if attention_mask is not None:
258
+ if attention_mask.shape[-1] != query.shape[1]:
259
+ target_length = query.shape[1]
260
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
261
+ attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
262
+
263
+
264
+ use_memory_efficient = XFORMERS_AVAILABLE and self._use_memory_efficient_attention_xformers
265
+ if use_memory_efficient and (dim // self.heads) % 8 != 0:
266
+ # print('Warning: the dim {} cannot be divided by 8. Fall into normal attention'.format(dim // self.heads))
267
+ use_memory_efficient = False
268
+
269
+ # attention, what we cannot get enough of
270
+ if use_memory_efficient:
271
+ query = self.reshape_heads_to_4d(query)
272
+ key = self.reshape_heads_to_4d(key)
273
+ value = self.reshape_heads_to_4d(value)
274
+
275
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
276
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
277
+ hidden_states = hidden_states.to(query.dtype)
278
+ else:
279
+ query = self.reshape_heads_to_batch_dim(query)
280
+ key = self.reshape_heads_to_batch_dim(key)
281
+ value = self.reshape_heads_to_batch_dim(value)
282
+
283
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
284
+ hidden_states = self._attention(query, key, value, attention_mask)
285
+ else:
286
+ raise NotImplementedError
287
+ # hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
288
+
289
+ # linear proj
290
+ hidden_states = self.to_out[0](hidden_states)
291
+
292
+ # dropout
293
+ hidden_states = self.to_out[1](hidden_states)
294
+
295
+ hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
296
+
297
+ return hidden_states
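A minimal sketch of TemporalModule (editorial, not part of the commit). It attends over the frame axis of a (B, C, T, H, W) feature map and, because proj_out is zero-initialized, it starts out as an identity mapping; with xformers installed the attention path expects CUDA tensors:

import torch
from extern.video_depth_anything.motion_module.motion_module import TemporalModule

B, C, T, H, W = 1, 256, 8, 14, 14
feat = torch.randn(B, C, T, H, W)
temporal = TemporalModule(in_channels=C, temporal_max_len=32)
out = temporal(feat, None, None)   # -> (B, C, T, H, W); equals feat at initialization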
extern/video_depth_anything/util/__pycache__/blocks.cpython-310.pyc ADDED
Binary file (3.24 kB).
 
extern/video_depth_anything/util/__pycache__/transform.cpython-310.pyc ADDED
Binary file (4.72 kB).
 
extern/video_depth_anything/util/__pycache__/util.cpython-310.pyc ADDED
Binary file (1.66 kB).
 
extern/video_depth_anything/util/blocks.py ADDED
@@ -0,0 +1,162 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
5
+ scratch = nn.Module()
6
+
7
+ out_shape1 = out_shape
8
+ out_shape2 = out_shape
9
+ out_shape3 = out_shape
10
+ if len(in_shape) >= 4:
11
+ out_shape4 = out_shape
12
+
13
+ if expand:
14
+ out_shape1 = out_shape
15
+ out_shape2 = out_shape * 2
16
+ out_shape3 = out_shape * 4
17
+ if len(in_shape) >= 4:
18
+ out_shape4 = out_shape * 8
19
+
20
+ scratch.layer1_rn = nn.Conv2d(
21
+ in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
22
+ )
23
+ scratch.layer2_rn = nn.Conv2d(
24
+ in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
25
+ )
26
+ scratch.layer3_rn = nn.Conv2d(
27
+ in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
28
+ )
29
+ if len(in_shape) >= 4:
30
+ scratch.layer4_rn = nn.Conv2d(
31
+ in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
32
+ )
33
+
34
+ return scratch
35
+
36
+
37
+ class ResidualConvUnit(nn.Module):
38
+ """Residual convolution module."""
39
+
40
+ def __init__(self, features, activation, bn):
41
+ """Init.
42
+
43
+ Args:
44
+ features (int): number of features
45
+ """
46
+ super().__init__()
47
+
48
+ self.bn = bn
49
+
50
+ self.groups = 1
51
+
52
+ self.conv1 = nn.Conv2d(
53
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
54
+ )
55
+
56
+ self.conv2 = nn.Conv2d(
57
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
58
+ )
59
+
60
+ if self.bn is True:
61
+ self.bn1 = nn.BatchNorm2d(features)
62
+ self.bn2 = nn.BatchNorm2d(features)
63
+
64
+ self.activation = activation
65
+
66
+ self.skip_add = nn.quantized.FloatFunctional()
67
+
68
+ def forward(self, x):
69
+ """Forward pass.
70
+
71
+ Args:
72
+ x (tensor): input
73
+
74
+ Returns:
75
+ tensor: output
76
+ """
77
+
78
+ out = self.activation(x)
79
+ out = self.conv1(out)
80
+ if self.bn is True:
81
+ out = self.bn1(out)
82
+
83
+ out = self.activation(out)
84
+ out = self.conv2(out)
85
+ if self.bn is True:
86
+ out = self.bn2(out)
87
+
88
+ if self.groups > 1:
89
+ out = self.conv_merge(out)
90
+
91
+ return self.skip_add.add(out, x)
92
+
93
+
94
+ class FeatureFusionBlock(nn.Module):
95
+ """Feature fusion block."""
96
+
97
+ def __init__(
98
+ self,
99
+ features,
100
+ activation,
101
+ deconv=False,
102
+ bn=False,
103
+ expand=False,
104
+ align_corners=True,
105
+ size=None,
106
+ ):
107
+ """Init.
108
+
109
+ Args:
110
+ features (int): number of features
111
+ """
112
+ super().__init__()
113
+
114
+ self.deconv = deconv
115
+ self.align_corners = align_corners
116
+
117
+ self.groups = 1
118
+
119
+ self.expand = expand
120
+ out_features = features
121
+ if self.expand is True:
122
+ out_features = features // 2
123
+
124
+ self.out_conv = nn.Conv2d(
125
+ features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1
126
+ )
127
+
128
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
129
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
130
+
131
+ self.skip_add = nn.quantized.FloatFunctional()
132
+
133
+ self.size = size
134
+
135
+ def forward(self, *xs, size=None):
136
+ """Forward pass.
137
+
138
+ Returns:
139
+ tensor: output
140
+ """
141
+ output = xs[0]
142
+
143
+ if len(xs) == 2:
144
+ res = self.resConfUnit1(xs[1])
145
+ output = self.skip_add.add(output, res)
146
+
147
+ output = self.resConfUnit2(output)
148
+
149
+ if (size is None) and (self.size is None):
150
+ modifier = {"scale_factor": 2}
151
+ elif size is None:
152
+ modifier = {"size": self.size}
153
+ else:
154
+ modifier = {"size": size}
155
+
156
+ output = nn.functional.interpolate(
157
+ output.contiguous(), **modifier, mode="bilinear", align_corners=self.align_corners
158
+ )
159
+
160
+ output = self.out_conv(output)
161
+
162
+ return output
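A small sketch of how these pieces compose (editorial). _make_scratch projects each backbone level onto a common channel width with 3x3 convolutions, and FeatureFusionBlock merges a coarse path with a lateral skip and upsamples by 2:

import torch
import torch.nn as nn
from extern.video_depth_anything.util.blocks import FeatureFusionBlock, _make_scratch

scratch = _make_scratch([256, 512, 1024, 1024], 256)   # scratch.layer{1..4}_rn: 3x3 convs to 256 channels
fuse = FeatureFusionBlock(256, nn.ReLU(False))

coarse = torch.randn(1, 256, 16, 16)   # deeper decoder path
skip = torch.randn(1, 256, 16, 16)     # lateral features at the same resolution
out = fuse(coarse, skip)               # -> (1, 256, 32, 32): fused, then upsampled x2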
extern/video_depth_anything/util/transform.py ADDED
@@ -0,0 +1,158 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
+
5
+ class Resize(object):
6
+ """Resize sample to given size (width, height).
7
+ """
8
+
9
+ def __init__(
10
+ self,
11
+ width,
12
+ height,
13
+ resize_target=True,
14
+ keep_aspect_ratio=False,
15
+ ensure_multiple_of=1,
16
+ resize_method="lower_bound",
17
+ image_interpolation_method=cv2.INTER_AREA,
18
+ ):
19
+ """Init.
20
+
21
+ Args:
22
+ width (int): desired output width
23
+ height (int): desired output height
24
+ resize_target (bool, optional):
25
+ True: Resize the full sample (image, mask, target).
26
+ False: Resize image only.
27
+ Defaults to True.
28
+ keep_aspect_ratio (bool, optional):
29
+ True: Keep the aspect ratio of the input sample.
30
+ Output sample might not have the given width and height, and
31
+ resize behaviour depends on the parameter 'resize_method'.
32
+ Defaults to False.
33
+ ensure_multiple_of (int, optional):
34
+ Output width and height are constrained to be a multiple of this parameter.
35
+ Defaults to 1.
36
+ resize_method (str, optional):
37
+ "lower_bound": Output will be at least as large as the given size.
38
+ "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than the given size.)
39
+ "minimal": Scale as little as possible. (Output size might be smaller than the given size.)
40
+ Defaults to "lower_bound".
41
+ """
42
+ self.__width = width
43
+ self.__height = height
44
+
45
+ self.__resize_target = resize_target
46
+ self.__keep_aspect_ratio = keep_aspect_ratio
47
+ self.__multiple_of = ensure_multiple_of
48
+ self.__resize_method = resize_method
49
+ self.__image_interpolation_method = image_interpolation_method
50
+
51
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
52
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
53
+
54
+ if max_val is not None and y > max_val:
55
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
56
+
57
+ if y < min_val:
58
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
59
+
60
+ return y
61
+
62
+ def get_size(self, width, height):
63
+ # determine new height and width
64
+ scale_height = self.__height / height
65
+ scale_width = self.__width / width
66
+
67
+ if self.__keep_aspect_ratio:
68
+ if self.__resize_method == "lower_bound":
69
+ # scale such that output size is lower bound
70
+ if scale_width > scale_height:
71
+ # fit width
72
+ scale_height = scale_width
73
+ else:
74
+ # fit height
75
+ scale_width = scale_height
76
+ elif self.__resize_method == "upper_bound":
77
+ # scale such that output size is upper bound
78
+ if scale_width < scale_height:
79
+ # fit width
80
+ scale_height = scale_width
81
+ else:
82
+ # fit height
83
+ scale_width = scale_height
84
+ elif self.__resize_method == "minimal":
85
+ # scale as little as possible
86
+ if abs(1 - scale_width) < abs(1 - scale_height):
87
+ # fit width
88
+ scale_height = scale_width
89
+ else:
90
+ # fit height
91
+ scale_width = scale_height
92
+ else:
93
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
94
+
95
+ if self.__resize_method == "lower_bound":
96
+ new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
97
+ new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
98
+ elif self.__resize_method == "upper_bound":
99
+ new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
100
+ new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
101
+ elif self.__resize_method == "minimal":
102
+ new_height = self.constrain_to_multiple_of(scale_height * height)
103
+ new_width = self.constrain_to_multiple_of(scale_width * width)
104
+ else:
105
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
106
+
107
+ return (new_width, new_height)
108
+
109
+ def __call__(self, sample):
110
+ width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
111
+
112
+ # resize sample
113
+ sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)
114
+
115
+ if self.__resize_target:
116
+ if "depth" in sample:
117
+ sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
118
+
119
+ if "mask" in sample:
120
+ sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)
121
+
122
+ return sample
123
+
124
+
125
+ class NormalizeImage(object):
126
+ """Normalize image by the given mean and std.
127
+ """
128
+
129
+ def __init__(self, mean, std):
130
+ self.__mean = mean
131
+ self.__std = std
132
+
133
+ def __call__(self, sample):
134
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
135
+
136
+ return sample
137
+
138
+
139
+ class PrepareForNet(object):
140
+ """Prepare sample for usage as network input.
141
+ """
142
+
143
+ def __init__(self):
144
+ pass
145
+
146
+ def __call__(self, sample):
147
+ image = np.transpose(sample["image"], (2, 0, 1))
148
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
149
+
150
+ if "depth" in sample:
151
+ depth = sample["depth"].astype(np.float32)
152
+ sample["depth"] = np.ascontiguousarray(depth)
153
+
154
+ if "mask" in sample:
155
+ sample["mask"] = sample["mask"].astype(np.float32)
156
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
157
+
158
+ return sample
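An editorial usage sketch mirroring the preprocessing set up later in video_depth.py: keep the aspect ratio, make the short side at least 518 px, and round both sides to a multiple of 14 (the ViT patch size):

import numpy as np
from extern.video_depth_anything.util.transform import Resize

resize = Resize(518, 518, resize_target=False, keep_aspect_ratio=True,
                ensure_multiple_of=14, resize_method='lower_bound')
sample = {"image": np.random.rand(720, 1280, 3).astype(np.float32)}
print(resize(sample)["image"].shape)   # (518, 924, 3): height hits the lower bound, width scales to match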
extern/video_depth_anything/util/util.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (2025) Bytedance Ltd. and/or its affiliates
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import numpy as np
15
+
16
+ def compute_scale_and_shift(prediction, target, mask, scale_only=False):
17
+ if scale_only:
18
+ return compute_scale(prediction, target, mask), 0
19
+ else:
20
+ return compute_scale_and_shift_full(prediction, target, mask)
21
+
22
+
23
+ def compute_scale(prediction, target, mask):
24
+ # system matrix: A = [[a_00, a_01], [a_10, a_11]]
25
+ prediction = prediction.astype(np.float32)
26
+ target = target.astype(np.float32)
27
+ mask = mask.astype(np.float32)
28
+
29
+ a_00 = np.sum(mask * prediction * prediction)
30
+ a_01 = np.sum(mask * prediction)
31
+ a_11 = np.sum(mask)
32
+
33
+ # right hand side: b = [b_0, b_1]
34
+ b_0 = np.sum(mask * prediction * target)
35
+
36
+ x_0 = b_0 / (a_00 + 1e-6)
37
+
38
+ return x_0
39
+
40
+ def compute_scale_and_shift_full(prediction, target, mask):
41
+ # system matrix: A = [[a_00, a_01], [a_10, a_11]]
42
+ prediction = prediction.astype(np.float32)
43
+ target = target.astype(np.float32)
44
+ mask = mask.astype(np.float32)
45
+
46
+ a_00 = np.sum(mask * prediction * prediction)
47
+ a_01 = np.sum(mask * prediction)
48
+ a_11 = np.sum(mask)
49
+
50
+ b_0 = np.sum(mask * prediction * target)
51
+ b_1 = np.sum(mask * target)
52
+
53
+ x_0 = 1
54
+ x_1 = 0
55
+
56
+ det = a_00 * a_11 - a_01 * a_01
57
+
58
+ if det != 0:
59
+ x_0 = (a_11 * b_0 - a_01 * b_1) / det
60
+ x_1 = (-a_01 * b_0 + a_00 * b_1) / det
61
+
62
+ return x_0, x_1
63
+
64
+
65
+ def get_interpolate_frames(frame_list_pre, frame_list_post):
66
+ assert len(frame_list_pre) == len(frame_list_post)
67
+ min_w = 0.0
68
+ max_w = 1.0
69
+ step = (max_w - min_w) / (len(frame_list_pre)-1)
70
+ post_w_list = [min_w] + [i * step for i in range(1,len(frame_list_pre)-1)] + [max_w]
71
+ interpolated_frames = []
72
+ for i in range(len(frame_list_pre)):
73
+ interpolated_frames.append(frame_list_pre[i] * (1-post_w_list[i]) + frame_list_post[i] * post_w_list[i])
74
+ return interpolated_frames
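A tiny editorial check of compute_scale_and_shift, which solves the 2x2 normal equations for the scale s and shift t minimizing sum(mask * (s * prediction + t - target)^2); it recovers an exact affine relation:

import numpy as np
from extern.video_depth_anything.util.util import compute_scale_and_shift

pred = np.array([1.0, 2.0, 3.0, 4.0])
target = 2.5 * pred + 0.7
mask = np.ones_like(pred)
scale, shift = compute_scale_and_shift(pred, target, mask)
print(scale, shift)   # 2.5 0.7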
extern/video_depth_anything/vdademo.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (2025) Bytedance Ltd. and/or its affiliates
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import numpy as np
16
+ import os
17
+ import torch
18
+ from extern.video_depth_anything.video_depth import VideoDepthAnything
19
+
20
+ class VDADemo:
21
+ def __init__(
22
+ self,
23
+ pre_train_path: str,
24
+ encoder: str = "vitl",
25
+ device: str = "cuda:0",
26
+ ):
27
+
28
+ model_configs = {
29
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
30
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
31
+ }
32
+
33
+ self.video_depth_anything = VideoDepthAnything(**model_configs[encoder])
34
+ self.video_depth_anything.load_state_dict(torch.load(pre_train_path, map_location='cpu'), strict=True)
35
+ self.video_depth_anything = self.video_depth_anything.to(device).eval()
36
+ self.device = device
37
+
38
+ def infer(
39
+ self,
40
+ frames,
41
+ near,
42
+ far,
43
+ input_size = 518,
44
+ target_fps = -1,
45
+ ):
46
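+ # heuristic: inputs that look [0, 1]-normalized are rescaled to [0, 255] before depth inference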
+ if frames.max() < 2.:
47
+ frames = frames*255.
48
+
49
+ with torch.inference_mode():
50
+ depths, fps = self.video_depth_anything.infer_video_depth(frames, target_fps, input_size, self.device)
51
+
52
+ depths = torch.from_numpy(depths).unsqueeze(1) # (T, H, W), e.g. 49x576x1024 -> (T, 1, H, W)
53
+ depths[depths < 1e-5] = 1e-5
54
+ depths = 10000. / depths
55
+ depths = depths.clip(near, far)
56
+
57
+
58
+ return depths
59
+
60
+
61
+
62
+
63
+
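An editorial usage sketch for VDADemo; the checkpoint path and the near/far clipping values are placeholders:

import numpy as np
from extern.video_depth_anything.vdademo import VDADemo

vda = VDADemo(pre_train_path="ckpts/video_depth_anything_vitl.pth", encoder="vitl", device="cuda:0")
frames = np.random.rand(49, 576, 1024, 3).astype(np.float32)   # (T, H, W, 3), RGB in [0, 1]
depths = vda.infer(frames, near=0.0001, far=10000.0)           # torch.Tensor, (T, 1, H, W), clipped to [near, far]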
extern/video_depth_anything/video_depth.py ADDED
@@ -0,0 +1,154 @@
1
+ # Copyright (2025) Bytedance Ltd. and/or its affiliates
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn.functional as F
16
+ import torch.nn as nn
17
+ from torchvision.transforms import Compose
18
+ import cv2
19
+ from tqdm import tqdm
20
+ import numpy as np
21
+ import gc
22
+
23
+ from extern.video_depth_anything.dinov2 import DINOv2
24
+ from extern.video_depth_anything.dpt_temporal import DPTHeadTemporal
25
+ from extern.video_depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
26
+
27
+ from extern.video_depth_anything.util.util import compute_scale_and_shift, get_interpolate_frames
28
+
29
+ # infer settings, do not change
30
+ INFER_LEN = 32
31
+ OVERLAP = 10
32
+ KEYFRAMES = [0,12,24,25,26,27,28,29,30,31]
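+ # KEYFRAMES: positions in the previous 32-frame window whose inputs seed the first OVERLAP frames of the next window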
33
+ INTERP_LEN = 8
34
+
35
+ class VideoDepthAnything(nn.Module):
36
+ def __init__(
37
+ self,
38
+ encoder='vitl',
39
+ features=256,
40
+ out_channels=[256, 512, 1024, 1024],
41
+ use_bn=False,
42
+ use_clstoken=False,
43
+ num_frames=32,
44
+ pe='ape'
45
+ ):
46
+ super(VideoDepthAnything, self).__init__()
47
+
48
+ self.intermediate_layer_idx = {
49
+ 'vits': [2, 5, 8, 11],
50
+ 'vitl': [4, 11, 17, 23]
51
+ }
52
+
53
+ self.encoder = encoder
54
+ self.pretrained = DINOv2(model_name=encoder)
55
+
56
+ self.head = DPTHeadTemporal(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken, num_frames=num_frames, pe=pe)
57
+
58
+ def forward(self, x):
59
+ B, T, C, H, W = x.shape
60
+ patch_h, patch_w = H // 14, W // 14
61
+ features = self.pretrained.get_intermediate_layers(x.flatten(0,1), self.intermediate_layer_idx[self.encoder], return_class_token=True)
62
+ depth = self.head(features, patch_h, patch_w, T)
63
+ depth = F.interpolate(depth, size=(H, W), mode="bilinear", align_corners=True)
64
+ depth = F.relu(depth)
65
+ return depth.squeeze(1).unflatten(0, (B, T)) # return shape [B, T, H, W]
66
+
67
+ def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda'):
68
+ frame_height, frame_width = frames[0].shape[:2]
69
+ ratio = max(frame_height, frame_width) / min(frame_height, frame_width)
70
+ if ratio > 1.78: # we recommend to process video with ratio smaller than 16:9 due to memory limitation
71
+ input_size = int(input_size * 1.777 / ratio)
72
+ input_size = round(input_size / 14) * 14
73
+
74
+ transform = Compose([
75
+ Resize(
76
+ width=input_size,
77
+ height=input_size,
78
+ resize_target=False,
79
+ keep_aspect_ratio=True,
80
+ ensure_multiple_of=14,
81
+ resize_method='lower_bound',
82
+ image_interpolation_method=cv2.INTER_CUBIC,
83
+ ),
84
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
85
+ PrepareForNet(),
86
+ ])
87
+
88
+ frame_list = [frames[i] for i in range(frames.shape[0])]
89
+ frame_step = INFER_LEN - OVERLAP
90
+ org_video_len = len(frame_list)
91
+ append_frame_len = (frame_step - (org_video_len % frame_step)) % frame_step + (INFER_LEN - frame_step)
92
+ frame_list = frame_list + [frame_list[-1].copy()] * append_frame_len
93
+
94
+ depth_list = []
95
+ pre_input = None
96
+ for frame_id in tqdm(range(0, org_video_len, frame_step)):
97
+ cur_list = []
98
+ for i in range(INFER_LEN):
99
+ cur_list.append(torch.from_numpy(transform({'image': frame_list[frame_id+i].astype(np.float32) / 255.0})['image']).unsqueeze(0).unsqueeze(0))
100
+ cur_input = torch.cat(cur_list, dim=1).to(device)
101
+ if pre_input is not None:
102
+ cur_input[:, :OVERLAP, ...] = pre_input[:, KEYFRAMES, ...]
103
+
104
+ with torch.no_grad():
105
+ depth = self.forward(cur_input) # depth shape: [1, T, H, W]
106
+
107
+ depth = F.interpolate(depth.flatten(0,1).unsqueeze(1), size=(frame_height, frame_width), mode='bilinear', align_corners=True)
108
+ depth_list += [depth[i][0].cpu().numpy() for i in range(depth.shape[0])]
109
+
110
+ pre_input = cur_input
111
+
112
+ del frame_list
113
+ gc.collect()
114
+
115
+ depth_list_aligned = []
116
+ ref_align = []
117
+ align_len = OVERLAP - INTERP_LEN
118
+ kf_align_list = KEYFRAMES[:align_len]
119
+
120
+ for frame_id in range(0, len(depth_list), INFER_LEN):
121
+ if len(depth_list_aligned) == 0:
122
+ depth_list_aligned += depth_list[:INFER_LEN]
123
+ for kf_id in kf_align_list:
124
+ ref_align.append(depth_list[frame_id+kf_id])
125
+ else:
126
+ curr_align = []
127
+ for i in range(len(kf_align_list)):
128
+ curr_align.append(depth_list[frame_id+i])
129
+ scale, shift = compute_scale_and_shift(np.concatenate(curr_align),
130
+ np.concatenate(ref_align),
131
+ np.concatenate(np.ones_like(ref_align)==1))
132
+
133
+ pre_depth_list = depth_list_aligned[-INTERP_LEN:]
134
+ post_depth_list = depth_list[frame_id+align_len:frame_id+OVERLAP]
135
+ for i in range(len(post_depth_list)):
136
+ post_depth_list[i] = post_depth_list[i] * scale + shift
137
+ post_depth_list[i][post_depth_list[i]<0] = 0
138
+ depth_list_aligned[-INTERP_LEN:] = get_interpolate_frames(pre_depth_list, post_depth_list)
139
+
140
+ for i in range(OVERLAP, INFER_LEN):
141
+ new_depth = depth_list[frame_id+i] * scale + shift
142
+ new_depth[new_depth<0] = 0
143
+ depth_list_aligned.append(new_depth)
144
+
145
+ ref_align = ref_align[:1]
146
+ for kf_id in kf_align_list[1:]:
147
+ new_depth = depth_list[frame_id+kf_id] * scale + shift
148
+ new_depth[new_depth<0] = 0
149
+ ref_align.append(new_depth)
150
+
151
+ depth_list = depth_list_aligned
152
+
153
+ return np.stack(depth_list[:org_video_len], axis=0), target_fps
154
+
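An editorial sketch of the sliding-window schedule implemented by infer_video_depth above: windows of INFER_LEN frames advance by INFER_LEN - OVERLAP, the tail is padded with copies of the last frame, and each new window is later aligned to the previous one via compute_scale_and_shift on the overlapping keyframes:

INFER_LEN, OVERLAP = 32, 10
frame_step = INFER_LEN - OVERLAP             # 22 new frames per window
org_video_len = 100                          # illustrative clip length
append = (frame_step - (org_video_len % frame_step)) % frame_step + (INFER_LEN - frame_step)
starts = list(range(0, org_video_len, frame_step))
print(append)   # 20: copies of the last frame so the final window is full (100 + 20 = 88 + 32)
print(starts)   # [0, 22, 44, 66, 88]: start index of each 32-frame window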