SkalskiP committed on
Commit
d336c38
1 Parent(s): a1ddcc8

Initial video processing

Files changed (4)
  1. app.py +171 -53
  2. requirements.txt +2 -1
  3. utils/efficient_sam.py +14 -0
  4. utils/video.py +27 -0
app.py CHANGED
@@ -1,18 +1,21 @@
 from typing import List

+import os
 import cv2
 import gradio as gr
 import numpy as np
 import supervision as sv
 import torch
+from tqdm import tqdm
 from inference.models import YOLOWorld

-from utils.efficient_sam import load, inference_with_box
+from utils.efficient_sam import load, inference_with_boxes
+from utils.video import generate_file_name, calculate_end_frame_index, create_directory

 MARKDOWN = """
 # YOLO-World + EfficientSAM 🔥

-This is a demo of zero-shot instance segmentation using
+This is a demo of zero-shot object detection and instance segmentation using
 [YOLO-World](https://github.com/AILab-CVC/YOLO-World) and
 [EfficientSAM](https://github.com/yformer/EfficientSAM).

@@ -20,9 +23,15 @@ Powered by Roboflow [Inference](https://github.com/roboflow/inference) and
 [Supervision](https://github.com/roboflow/supervision).
 """

-EXAMPLES = [
+RESULTS = "results"
+
+IMAGE_EXAMPLES = [
     ['https://media.roboflow.com/dog.jpeg', 'dog, eye, nose, tongue, car', 0.005, 0.1, True, False, False],
 ]
+VIDEO_EXAMPLES = [
+    ['https://media.roboflow.com/supervision/video-examples/croissant-1280x720.mp4', 'croissant', 0.01, 0.2, False, False, False],
+    ['https://media.roboflow.com/supervision/video-examples/suitcases-1280x720.mp4', 'suitcase', 0.1, 0.2, False, False, False],
+]

 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 EFFICIENT_SAM_MODEL = load(device=DEVICE)
@@ -33,10 +42,34 @@ MASK_ANNOTATOR = sv.MaskAnnotator()
 LABEL_ANNOTATOR = sv.LabelAnnotator()


+create_directory(directory_path=RESULTS)
+
+
 def process_categories(categories: str) -> List[str]:
     return [category.strip() for category in categories.split(',')]


+def annotate_image(
+    input_image: np.ndarray,
+    detections: sv.Detections,
+    categories: List[str],
+    with_confidence: bool = False,
+) -> np.ndarray:
+    labels = [
+        (
+            f"{categories[class_id]}: {confidence:.3f}"
+            if with_confidence
+            else f"{categories[class_id]}"
+        )
+        for class_id, confidence in
+        zip(detections.class_id, detections.confidence)
+    ]
+    output_image = MASK_ANNOTATOR.annotate(input_image, detections)
+    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
+    return output_image
+
+
 def process_image(
     input_image: np.ndarray,
     categories: str,
@@ -52,31 +85,69 @@ def process_image(
     detections = sv.Detections.from_inference(results)
     detections = detections.with_nms(
         class_agnostic=with_class_agnostic_nms,
-        threshold=iou_threshold)
+        threshold=iou_threshold
+    )
     if with_segmentation:
-        masks = []
-        for [x_min, y_min, x_max, y_max] in detections.xyxy:
-            box = np.array([[x_min, y_min], [x_max, y_max]])
-            mask = inference_with_box(input_image, box, EFFICIENT_SAM_MODEL, DEVICE)
-            masks.append(mask)
-        detections.mask = np.array(masks)
-
-    labels = [
-        (
-            f"{categories[class_id]}: {confidence:.2f}"
-            if with_confidence
-            else f"{categories[class_id]}"
+        detections.mask = inference_with_boxes(
+            image=input_image,
+            xyxy=detections.xyxy,
+            model=EFFICIENT_SAM_MODEL,
+            device=DEVICE
         )
-        for class_id, confidence in
-        zip(detections.class_id, detections.confidence)
-    ]
-    output_image = input_image.copy()
-    output_image = cv2.cvtColor(output_image, cv2.COLOR_RGB2BGR)
-    output_image = MASK_ANNOTATOR.annotate(output_image, detections)
-    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
-    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
-    output_image = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
-    return output_image
+    output_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
+    output_image = annotate_image(
+        input_image=output_image,
+        detections=detections,
+        categories=categories,
+        with_confidence=with_confidence
+    )
+    return cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
+
+
+def process_video(
+    input_video: str,
+    categories: str,
+    confidence_threshold: float = 0.3,
+    iou_threshold: float = 0.5,
+    with_segmentation: bool = True,
+    with_confidence: bool = False,
+    with_class_agnostic_nms: bool = False,
+    progress=gr.Progress(track_tqdm=True)
+) -> str:
+    categories = process_categories(categories)
+    YOLO_WORLD_MODEL.set_classes(categories)
+    video_info = sv.VideoInfo.from_video_path(input_video)
+    total = calculate_end_frame_index(input_video)
+    frame_generator = sv.get_video_frames_generator(
+        source_path=input_video,
+        end=total
+    )
+    result_file_name = generate_file_name(extension="mp4")
+    result_file_path = os.path.join(RESULTS, result_file_name)
+    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
+        for _ in tqdm(range(total), desc="Processing video..."):
+            frame = next(frame_generator)
+            results = YOLO_WORLD_MODEL.infer(frame, confidence=confidence_threshold)
+            detections = sv.Detections.from_inference(results)
+            detections = detections.with_nms(
+                class_agnostic=with_class_agnostic_nms,
+                threshold=iou_threshold
+            )
+            if with_segmentation:
+                detections.mask = inference_with_boxes(
+                    image=frame,
+                    xyxy=detections.xyxy,
+                    model=EFFICIENT_SAM_MODEL,
+                    device=DEVICE
+                )
+            frame = annotate_image(
+                input_image=frame,
+                detections=detections,
+                categories=categories,
+                with_confidence=with_confidence
+            )
+            sink.write_frame(frame)
+    return result_file_path


 confidence_threshold_component = gr.Slider(
@@ -140,32 +211,80 @@ with gr.Blocks() as demo:
     with_segmentation_component.render()
     with_confidence_component.render()
     with_class_agnostic_nms_component.render()
-    with gr.Row():
-        input_image_component = gr.Image(
-            type='numpy',
-            label='Input Image'
-        )
-        output_image_component = gr.Image(
-            type='numpy',
-            label='Output Image'
-        )
-    with gr.Row():
-        categories_text_component = gr.Textbox(
-            label='Categories',
-            placeholder='comma separated list of categories',
-            scale=7
+    with gr.Tab(label="Image"):
+        with gr.Row():
+            input_image_component = gr.Image(
+                type='numpy',
+                label='Input Image'
+            )
+            output_image_component = gr.Image(
+                type='numpy',
+                label='Output Image'
+            )
+        with gr.Row():
+            image_categories_text_component = gr.Textbox(
+                label='Categories',
+                placeholder='comma separated list of categories',
+                scale=7
+            )
+            image_submit_button_component = gr.Button(
+                value='Submit',
+                scale=1,
+                variant='primary'
+            )
+        gr.Examples(
+            fn=process_image,
+            examples=IMAGE_EXAMPLES,
+            inputs=[
+                input_image_component,
+                image_categories_text_component,
+                confidence_threshold_component,
+                iou_threshold_component,
+                with_segmentation_component,
+                with_confidence_component,
+                with_class_agnostic_nms_component
+            ],
+            outputs=output_image_component
         )
-        submit_button_component = gr.Button(
-            value='Submit',
-            scale=1,
-            variant='primary'
+    with gr.Tab(label="Video"):
+        with gr.Row():
+            input_video_component = gr.Video(
+                label='Input Video'
+            )
+            output_video_component = gr.Video(
+                label='Output Video'
+            )
+        with gr.Row():
+            video_categories_text_component = gr.Textbox(
+                label='Categories',
+                placeholder='comma separated list of categories',
+                scale=7
+            )
+            video_submit_button_component = gr.Button(
+                value='Submit',
+                scale=1,
+                variant='primary'
+            )
+        gr.Examples(
+            fn=process_video,
+            examples=VIDEO_EXAMPLES,
+            inputs=[
+                input_video_component,
+                video_categories_text_component,
+                confidence_threshold_component,
+                iou_threshold_component,
+                with_segmentation_component,
+                with_confidence_component,
+                with_class_agnostic_nms_component
+            ],
+            outputs=output_image_component
         )
-    gr.Examples(
+
+    image_submit_button_component.click(
         fn=process_image,
-        examples=EXAMPLES,
         inputs=[
             input_image_component,
-            categories_text_component,
+            image_categories_text_component,
             confidence_threshold_component,
             iou_threshold_component,
             with_segmentation_component,
@@ -174,19 +293,18 @@ with gr.Blocks() as demo:
         ],
         outputs=output_image_component
     )
-
-    submit_button_component.click(
-        fn=process_image,
+    video_submit_button_component.click(
+        fn=process_video,
         inputs=[
-            input_image_component,
-            categories_text_component,
+            input_video_component,
+            video_categories_text_component,
            confidence_threshold_component,
             iou_threshold_component,
             with_segmentation_component,
             with_confidence_component,
             with_class_agnostic_nms_component
         ],
-        outputs=output_image_component
+        outputs=output_video_component
     )

 demo.launch(debug=False, show_error=True)
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 inference-gpu[yolo-world]==0.9.13
 supervision==0.19.0rc3
-gradio==4.19.0
+gradio==4.19.0
+tqdm==4.66.2
utils/efficient_sam.py CHANGED
@@ -45,3 +45,17 @@ def inference_with_box(
             max_predicted_iou = curr_predicted_iou
             selected_mask_using_predicted_iou = all_masks[m]
     return selected_mask_using_predicted_iou
+
+
+def inference_with_boxes(
+    image: np.ndarray,
+    xyxy: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    masks = []
+    for [x_min, y_min, x_max, y_max] in xyxy:
+        box = np.array([[x_min, y_min], [x_max, y_max]])
+        mask = inference_with_box(image, box, model, device)
+        masks.append(mask)
+    return np.array(masks)
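
Note (not part of the diff): a minimal usage sketch of the new inference_with_boxes helper, mirroring how app.py wires it into a supervision pipeline. It assumes an RGB `image` array and a `detections` object (sv.Detections built from YOLO-World output) already exist; the model and device are loaded the same way app.py loads them.

import torch

from utils.efficient_sam import load, inference_with_boxes

# Sketch only: `image` and `detections` are assumed to be produced elsewhere.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load(device=device)  # TorchScript EfficientSAM module, loaded as in app.py

# One boolean mask per xyxy box, assigned back onto the detections object.
detections.mask = inference_with_boxes(
    image=image,
    xyxy=detections.xyxy,
    model=model,
    device=device
)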
utils/video.py ADDED
@@ -0,0 +1,27 @@
+import os
+import datetime
+import uuid
+
+import supervision as sv
+
+
+MAX_VIDEO_LENGTH_SEC = 3
+
+
+def generate_file_name(extension="mp4"):
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    unique_id = uuid.uuid4()
+    return f"{current_datetime}_{unique_id}.{extension}"
+
+
+def calculate_end_frame_index(source_video_path: str) -> int:
+    video_info = sv.VideoInfo.from_video_path(source_video_path)
+    return min(
+        video_info.total_frames,
+        video_info.fps * MAX_VIDEO_LENGTH_SEC
+    )
+
+
+def create_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)
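
Note (not part of the diff): a small sketch of how app.py composes these helpers; the source path is illustrative. calculate_end_frame_index caps processing at fps * MAX_VIDEO_LENGTH_SEC frames (for a 30 fps clip, at most 90 frames), and the result is written to the results directory under a timestamp + UUID file name.

import os

import supervision as sv

from utils.video import generate_file_name, calculate_end_frame_index, create_directory

RESULTS = "results"
create_directory(directory_path=RESULTS)

source_path = "input.mp4"  # hypothetical local clip
video_info = sv.VideoInfo.from_video_path(source_path)
total = calculate_end_frame_index(source_path)  # min(total_frames, fps * MAX_VIDEO_LENGTH_SEC)
frame_generator = sv.get_video_frames_generator(source_path=source_path, end=total)

target_path = os.path.join(RESULTS, generate_file_name(extension="mp4"))
with sv.VideoSink(target_path, video_info=video_info) as sink:
    for frame in frame_generator:
        sink.write_frame(frame)  # the real app annotates each frame before writing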