Update app.py
app.py CHANGED
@@ -17,11 +17,27 @@ from vincenty import vincenty
 import json
 #import DracoPy
 from collections import Counter
-
-from depth_anything.dpt import DepthAnything
-from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
 import mediapy
 
+#from depth_anything.dpt import DepthAnything
+#from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+from huggingface_hub import hf_hub_download
+from depth_anything_v2.dpt import DepthAnythingV2
+
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+model_configs = {
+    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
+}
+encoder2name = {
+    'vits': 'Small',
+    'vitb': 'Base',
+    'vitl': 'Large',
+    'vitg': 'Giant', # we are undergoing company review procedures to release our giant model checkpoint
+}
+
 edge = []
 gradient = None
 params = { "fnum":0, "l":16 }
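
The two tables added in this hunk are what the rest of the commit keys on: model_configs holds the constructor arguments for each DepthAnythingV2 variant, and encoder2name turns an encoder id into the checkpoint repo suffix on the Hub. A minimal sketch of the lookup, with an illustrative encoder value:

encoder = 'vitb'                      # any of 'vits', 'vitb', 'vitl', 'vitg'
config = model_configs[encoder]       # kwargs for the DepthAnythingV2 constructor
repo_id = f"depth-anything/Depth-Anything-V2-{encoder2name[encoder]}"
filename = f"depth_anything_v2_{encoder}.pth"   # checkpoint file inside that repo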

@@ -54,43 +70,53 @@ def create_video(frames, fps, type):
     return type + "_result.mp4"
 
 @torch.no_grad()
-def predict_depth(model, image):
-    return model(image)["depth"]
-
 #@spaces.GPU
+def predict_depth(image, model):
+    return model.infer_image(image)
+
+#def predict_depth(model, image):
+#    return model(image)["depth"]
+
 def make_video(video_path, outdir='./vis_video_depth', encoder='vits', remove_bg=False):
     if encoder not in ["vitl","vitb","vits"]:
         encoder = "vits"
 
-
+    model_name = encoder2name[encoder]
+    model = DepthAnythingV2(**model_configs[encoder])
+    filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-{model_name}", filename=f"depth_anything_v2_{encoder}.pth", repo_type="model")
+    state_dict = torch.load(filepath, map_location="cpu")
+    model.load_state_dict(state_dict)
+    model = model.to(DEVICE).eval()
+
+    #mapper = {"vits":"small","vitb":"base","vitl":"large"}
     # DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
     # model = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14').to(DEVICE).eval()
     # Define path for temporary processed frames
-    temp_frame_dir = tempfile.mkdtemp()
+    #temp_frame_dir = tempfile.mkdtemp()
 
-    margin_width = 50
-    to_tensor_transform = transforms.ToTensor()
+    #margin_width = 50
+    #to_tensor_transform = transforms.ToTensor()
 
-    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+    #DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
     # depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(encoder)).to(DEVICE).eval()
-    depth_anything = pipeline(task = "depth-estimation", model=f"nielsr/depth-anything-{mapper[encoder]}")
+    #depth_anything = pipeline(task = "depth-estimation", model=f"nielsr/depth-anything-{mapper[encoder]}")
 
     # total_params = sum(param.numel() for param in depth_anything.parameters())
     # print('Total parameters: {:.2f}M'.format(total_params / 1e6))
 
-    transform = Compose([
-        Resize(
-            width=518,
-            height=518,
-            resize_target=False,
-            keep_aspect_ratio=True,
-            ensure_multiple_of=14,
-            resize_method='lower_bound',
-            image_interpolation_method=cv2.INTER_CUBIC,
-        ),
-        NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-        PrepareForNet(),
-    ])
+    #transform = Compose([
+    #    Resize(
+    #        width=518,
+    #        height=518,
+    #        resize_target=False,
+    #        keep_aspect_ratio=True,
+    #        ensure_multiple_of=14,
+    #        resize_method='lower_bound',
+    #        image_interpolation_method=cv2.INTER_CUBIC,
+    #    ),
+    #    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    #    PrepareForNet(),
+    #])
 
     if os.path.isfile(video_path):
         if video_path.endswith('txt'):
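
Pulled out of make_video, the new loading path reads as the following self-contained sketch. It reuses only the calls this commit adds; "example.jpg" is a placeholder, and infer_image is assumed to take an OpenCV-style BGR uint8 array and return a float depth map, as in the upstream Depth-Anything-V2 examples:

import cv2
import torch
from huggingface_hub import hf_hub_download
from depth_anything_v2.dpt import DepthAnythingV2

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
encoder, name = 'vits', 'Small'       # smallest released checkpoint

model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384])
filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-{name}",
                           filename=f"depth_anything_v2_{encoder}.pth", repo_type="model")
model.load_state_dict(torch.load(filepath, map_location="cpu"))
model = model.to(DEVICE).eval()

raw_img = cv2.imread("example.jpg")   # placeholder path, BGR uint8
depth = model.infer_image(raw_img)    # 2-D float depth map at input resolution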

@@ -144,19 +170,23 @@ def make_video(video_path, outdir='./vis_video_depth', encoder='vits', remove_bg
 
         frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB) / 255.0
         frame_pil = Image.fromarray((frame * 255).astype(np.uint8))
-        frame = transform({'image': frame})['image']
-
-        frame = torch.from_numpy(frame).unsqueeze(0).to(DEVICE)
-
-
-        depth = to_tensor_transform(predict_depth(depth_anything, frame_pil))
+        #frame = transform({'image': frame})['image']
+        #frame = torch.from_numpy(frame).unsqueeze(0).to(DEVICE)
 
-        depth = F.interpolate(depth[None], (frame_height, frame_width), mode='bilinear', align_corners=False)[0, 0]
+        #
+        depth = predict_depth(raw_frame[:, :, ::-1], model)
         depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
-        depth = depth.cpu().numpy().astype(np.uint8)
-
-        depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_BONE)
+        depth = depth.astype(np.uint8)
+        depth_color = np.array(Image.fromarray(depth).convert("RGBA"))  # 4-channel array so the RGBA2GRAY conversion below succeeds
         depth_gray = cv2.cvtColor(depth_color, cv2.COLOR_RGBA2GRAY)
+        #
+
+        #depth = to_tensor_transform(predict_depth(depth_anything, frame_pil))
+        #depth = F.interpolate(depth[None], (frame_height, frame_width), mode='bilinear', align_corners=False)[0, 0]
+        #depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+        #depth = depth.cpu().numpy().astype(np.uint8)
+        #depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_BONE)
+        #depth_gray = cv2.cvtColor(depth_color, cv2.COLOR_RGBA2GRAY)
 
         # Remove white border around map:
         # define lower and upper limits of white
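
The per-frame depth in this hunk is min-max rescaled to 8-bit before the grayscale conversion. One edge case worth noting: a perfectly flat depth map makes depth.max() - depth.min() zero and the division blows up. A defensive standalone version (illustrative, not part of the commit):

import numpy as np

def depth_to_gray(depth: np.ndarray) -> np.ndarray:
    """Min-max normalize a float depth map to a uint8 grayscale frame."""
    span = float(depth.max() - depth.min())
    if span == 0.0:                    # flat frame: avoid division by zero
        return np.zeros(depth.shape, dtype=np.uint8)
    return ((depth - depth.min()) / span * 255.0).astype(np.uint8)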

@@ -896,23 +926,24 @@ css = """
 }
 """
 
-title = "# Depth Anything Video"
-description = """Depth Anything on full video files.
-Please refer to our [paper](https://arxiv.org/abs/
-
-
-
-
-
-
-
-
-
-
-
-
-
+title = "# Depth Anything V2 Video"
+description = """**Depth Anything V2** on full video files.
+Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), and [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details."""
+
+
+#transform = Compose([
+#    Resize(
+#        width=518,
+#        height=518,
+#        resize_target=False,
+#        keep_aspect_ratio=True,
+#        ensure_multiple_of=14,
+#        resize_method='lower_bound',
+#        image_interpolation_method=cv2.INTER_CUBIC,
+#    ),
+#    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+#    PrepareForNet(),
+#])
 
 # @torch.no_grad()
 # def predict_depth(model, image):