freealise committed
Commit 00e81be
1 Parent(s): ac1033d

Update app.py

Files changed (1)
  1. app.py +83 -52
app.py CHANGED
@@ -17,11 +17,27 @@ from vincenty import vincenty
 import json
 #import DracoPy
 from collections import Counter
-
-from depth_anything.dpt import DepthAnything
-from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
 import mediapy
 
+#from depth_anything.dpt import DepthAnything
+#from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+from huggingface_hub import hf_hub_download
+from depth_anything_v2.dpt import DepthAnythingV2
+
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+model_configs = {
+    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
+}
+encoder2name = {
+    'vits': 'Small',
+    'vitb': 'Base',
+    'vitl': 'Large',
+    'vitg': 'Giant', # we are undergoing company review procedures to release our giant model checkpoint
+}
+
 edge = []
 gradient = None
 params = { "fnum":0, "l":16 }
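For orientation: the new `model_configs` table holds the DPT decoder hyper-parameters per ViT backbone, while `encoder2name` maps the same keys to the checkpoint repo names on the Hub. A minimal sketch of how one key resolves (repo and file names exactly as used later in this commit):

```python
# Sketch: resolving an encoder key to its Hub checkpoint, per the tables above.
encoder = "vits"
repo_id = f"depth-anything/Depth-Anything-V2-{encoder2name[encoder]}"  # -> depth-anything/Depth-Anything-V2-Small
filename = f"depth_anything_v2_{encoder}.pth"                          # -> depth_anything_v2_vits.pth
```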
@@ -54,43 +70,53 @@ def create_video(frames, fps, type):
     return type + "_result.mp4"
 
 @torch.no_grad()
-def predict_depth(model, image):
-    return model(image)["depth"]
-
 #@spaces.GPU
+def predict_depth(image, model):
+    return model.infer_image(image)
+
+#def predict_depth(model, image):
+#    return model(image)["depth"]
+
 def make_video(video_path, outdir='./vis_video_depth', encoder='vits', remove_bg=False):
     if encoder not in ["vitl","vitb","vits"]:
         encoder = "vits"
 
-    mapper = {"vits":"small","vitb":"base","vitl":"large"}
+    model_name = encoder2name[encoder]
+    model = DepthAnythingV2(**model_configs[encoder])
+    filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-{model_name}", filename=f"depth_anything_v2_{encoder}.pth", repo_type="model")
+    state_dict = torch.load(filepath, map_location="cpu")
+    model.load_state_dict(state_dict)
+    model = model.to(DEVICE).eval()
+
+    #mapper = {"vits":"small","vitb":"base","vitl":"large"}
     # DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
     # model = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14').to(DEVICE).eval()
     # Define path for temporary processed frames
-    temp_frame_dir = tempfile.mkdtemp()
+    #temp_frame_dir = tempfile.mkdtemp()
 
-    margin_width = 50
-    to_tensor_transform = transforms.ToTensor()
+    #margin_width = 50
+    #to_tensor_transform = transforms.ToTensor()
 
-    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+    #DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
     # depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(encoder)).to(DEVICE).eval()
-    depth_anything = pipeline(task = "depth-estimation", model=f"nielsr/depth-anything-{mapper[encoder]}")
+    #depth_anything = pipeline(task = "depth-estimation", model=f"nielsr/depth-anything-{mapper[encoder]}")
 
     # total_params = sum(param.numel() for param in depth_anything.parameters())
     # print('Total parameters: {:.2f}M'.format(total_params / 1e6))
 
-    transform = Compose([
-        Resize(
-            width=518,
-            height=518,
-            resize_target=False,
-            keep_aspect_ratio=True,
-            ensure_multiple_of=14,
-            resize_method='lower_bound',
-            image_interpolation_method=cv2.INTER_CUBIC,
-        ),
-        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-        PrepareForNet(),
-    ])
+    #transform = Compose([
+    #    Resize(
+    #        width=518,
+    #        height=518,
+    #        resize_target=False,
+    #        keep_aspect_ratio=True,
+    #        ensure_multiple_of=14,
+    #        resize_method='lower_bound',
+    #        image_interpolation_method=cv2.INTER_CUBIC,
+    #    ),
+    #    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    #    PrepareForNet(),
+    #])
 
     if os.path.isfile(video_path):
         if video_path.endswith('txt'):
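Taken together, this hunk swaps the V1 `transformers` pipeline for an explicit checkpoint download plus `infer_image`. A self-contained sketch of the same loading path outside the app (a rough sketch: it assumes the `depth_anything_v2` package from the Depth-Anything-V2 repo is on the import path, and `frame.png` is a hypothetical stand-in input):

```python
import cv2
import torch
from huggingface_hub import hf_hub_download
from depth_anything_v2.dpt import DepthAnythingV2

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the small model with the config this commit uses for 'vits'.
model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384])
filepath = hf_hub_download(repo_id="depth-anything/Depth-Anything-V2-Small",
                           filename="depth_anything_v2_vits.pth", repo_type="model")
model.load_state_dict(torch.load(filepath, map_location="cpu"))
model = model.to(DEVICE).eval()

raw_frame = cv2.imread("frame.png")  # BGR uint8, as cv2.VideoCapture also yields
with torch.no_grad():
    depth = model.infer_image(raw_frame[:, :, ::-1])  # same channel flip as in make_video()
print(depth.shape, float(depth.min()), float(depth.max()))  # float HxW relative depth map
```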
@@ -144,19 +170,23 @@ def make_video(video_path, outdir='./vis_video_depth', encoder='vits', remove_bg
 
             frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB) / 255.0
             frame_pil = Image.fromarray((frame * 255).astype(np.uint8))
-            frame = transform({'image': frame})['image']
-
-            frame = torch.from_numpy(frame).unsqueeze(0).to(DEVICE)
-
-
-            depth = to_tensor_transform(predict_depth(depth_anything, frame_pil))
+            #frame = transform({'image': frame})['image']
+            #frame = torch.from_numpy(frame).unsqueeze(0).to(DEVICE)
 
-            depth = F.interpolate(depth[None], (frame_height, frame_width), mode='bilinear', align_corners=False)[0, 0]
+            #
+            depth = predict_depth(raw_frame[:, :, ::-1], model)
             depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
-
-            depth = depth.cpu().numpy().astype(np.uint8)
-            depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_BONE)
+            depth = depth.astype(np.uint8)
+            depth_color = Image.fromarray(depth)
             depth_gray = cv2.cvtColor(depth_color, cv2.COLOR_RGBA2GRAY)
+            #
+
+            #depth = to_tensor_transform(predict_depth(depth_anything, frame_pil))
+            #depth = F.interpolate(depth[None], (frame_height, frame_width), mode='bilinear', align_corners=False)[0, 0]
+            #depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+            #depth = depth.cpu().numpy().astype(np.uint8)
+            #depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_BONE)
+            #depth_gray = cv2.cvtColor(depth_color, cv2.COLOR_RGBA2GRAY)
 
             # Remove white border around map:
             # define lower and upper limits of white
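Two notes on the new per-frame depth path: the min-max scaling is recomputed per frame, so absolute brightness can flicker between frames; and `cv2.cvtColor` expects a NumPy array, so passing it the PIL `Image` built on the previous line (a single-channel one, at that) would likely raise an error as written. A hedged sketch of one way to derive a grayscale array, a 3-channel copy, and a PIL handle from the same map (variable names here are illustrative, not from the commit):

```python
import numpy as np
import cv2
from PIL import Image

# Stand-in for the float HxW map returned by predict_depth() for one frame:
depth = np.random.rand(480, 640).astype(np.float32)

depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0  # per-frame scaling
depth = depth.astype(np.uint8)

depth_gray = depth                                   # single channel is already grayscale
depth_bgr = cv2.cvtColor(depth, cv2.COLOR_GRAY2BGR)  # 3-channel copy for OpenCV drawing
depth_pil = Image.fromarray(depth)                   # PIL handle, if needed downstream
```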
@@ -896,23 +926,24 @@ css = """
 }
 """
 
-title = "# Depth Anything Video Demo"
-description = """Depth Anything on full video files.
-Please refer to our [paper](https://arxiv.org/abs/2401.10891), [project page](https://depth-anything.github.io), or [github](https://github.com/LiheYoung/Depth-Anything) for more details."""
-
-transform = Compose([
-    Resize(
-        width=518,
-        height=518,
-        resize_target=False,
-        keep_aspect_ratio=True,
-        ensure_multiple_of=14,
-        resize_method='lower_bound',
-        image_interpolation_method=cv2.INTER_CUBIC,
-    ),
-    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-    PrepareForNet(),
-])
+title = "# Depth Anything V2 Video"
+description = """**Depth Anything V2** on full video files.
+Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), and [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details."""
+
+
+#transform = Compose([
+#    Resize(
+#        width=518,
+#        height=518,
+#        resize_target=False,
+#        keep_aspect_ratio=True,
+#        ensure_multiple_of=14,
+#        resize_method='lower_bound',
+#        image_interpolation_method=cv2.INTER_CUBIC,
+#    ),
+#    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+#    PrepareForNet(),
+#])
 
 # @torch.no_grad()
 # def predict_depth(model, image):
 