efraim1011 committed
Commit
00fa262
1 Parent(s): a0159d8

Create app.py

Files changed (1):
  app.py +371 -0
app.py ADDED
@@ -0,0 +1,371 @@
+ from typing import List
+
+ import datetime
+ import os
+ import uuid
+
+ import cv2
+ import gradio as gr
+ import numpy as np
+ import supervision as sv
+ import torch
+ from tqdm import tqdm
+ from translate import Translator
+
+ from inference.models.yolo_world.yolo_world import YOLOWorld
+
+
+ def generate_file_name(extension="mp4"):
+     # Build a unique file name from the current timestamp and a random UUID.
+     current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+     unique_id = uuid.uuid4()
+     return f"{current_datetime}_{unique_id}.{extension}"
+
+
+ def list_files_older_than(directory: str, diff_minutes: int) -> List[str]:
+     # Return the paths of files in `directory` last modified more than
+     # `diff_minutes` minutes ago.
+     diff_seconds = diff_minutes * 60
+     now = datetime.datetime.now()
+     older_files: List[str] = []
+
+     for filename in os.listdir(directory):
+         file_path = os.path.join(directory, filename)
+         if os.path.isfile(file_path):
+             file_mod_time = os.path.getmtime(file_path)
+             file_mod_datetime = datetime.datetime.fromtimestamp(file_mod_time)
+             time_diff = now - file_mod_datetime
+             if time_diff.total_seconds() > diff_seconds:
+                 older_files.append(file_path)
+
+     return older_files
+
+
+ def remove_files_older_than(directory: str, diff_minutes: int) -> None:
+     # Delete stale result files and log how many were removed.
+     older_files = list_files_older_than(directory, diff_minutes)
+     file_count = len(older_files)
+
+     for file_path in older_files:
+         os.remove(file_path)
+
+     now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     print(
+         f"[{now}] Removed {file_count} files older than {diff_minutes} minutes from "
+         f"'{directory}' directory."
+     )
+
+
+ def calculate_end_frame_index(source_video_path: str) -> int:
+     video_info = sv.VideoInfo.from_video_path(source_video_path)
+     return video_info.total_frames
+
+
+ def create_directory(directory_path: str) -> None:
+     if not os.path.exists(directory_path):
+         os.makedirs(directory_path)
+
+
+ MARKDOWN = """
+ <h1>Porto do Itaqui YOLO-World</h1>
+
+ Este é um protótipo em desenvolvimento que será apresentado ao Porto do Itaqui
+ para demonstrar a detecção de objetos de vocabulário aberto com o YOLO-World.
+ """
+
+ RESULTS = "results"
+
+ # IMAGE_EXAMPLES = [
+ #     ['https://media.roboflow.com/dog.jpeg', 'dog, eye, nose, tongue, car', 0.005, 0.1, True, False, False],
+ #     ['https://media.roboflow.com/albert-4x.png', 'hand, hair', 0.005, 0.1, True, False, False],
+ # ]
+ # VIDEO_EXAMPLES = [
+ #     ['https://media.roboflow.com/supervision/video-examples/croissant-1280x720.mp4', 'croissant', 0.01, 0.2, False, False, False],
+ #     ['https://media.roboflow.com/supervision/video-examples/suitcases-1280x720.mp4', 'suitcase', 0.1, 0.2, False, False, False],
+ #     ['https://media.roboflow.com/supervision/video-examples/tokyo-walk-1280x720.mp4', 'woman walking', 0.1, 0.2, False, False, False],
+ #     ['https://media.roboflow.com/supervision/video-examples/wooly-mammoth-1280x720.mp4', 'mammoth', 0.01, 0.2, False, False, False],
+ # ]
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # EFFICIENT_SAM_MODEL = load(device=DEVICE)
+ YOLO_WORLD_MODEL = YOLOWorld(model_id="yolo_world/l")
+
+ BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=2)
+ MASK_ANNOTATOR = sv.MaskAnnotator()
+ LABEL_ANNOTATOR = sv.LabelAnnotator(text_thickness=2, text_scale=1, text_color=sv.Color.BLACK)
+
+ # create the directory for video results
+ create_directory(directory_path=RESULTS)
+
+
+ def process_categories(categories: str) -> List[str]:
+     # Translate the comma-separated categories from Portuguese to English,
+     # so that YOLO-World receives English prompts.
+     translator = Translator(from_lang="pt", to_lang="en")
+     translation = translator.translate(categories)
+
+     return [category.strip() for category in translation.split(',')]
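+
+ # Hypothetical example (the exact output depends on the `translate` backend):
+ #   process_categories("navio, guindaste") -> ["ship", "crane"]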
+
+
+ def annotate_image(
+     input_image: np.ndarray,
+     detections: sv.Detections,
+     categories: List[str],
+     with_confidence: bool = False,
+ ) -> np.ndarray:
+     # Build one label per detection, optionally including its confidence score.
+     labels = [
+         (
+             f"{categories[class_id]}: {confidence:.3f}"
+             if with_confidence
+             else f"{categories[class_id]}"
+         )
+         for class_id, confidence in
+         zip(detections.class_id, detections.confidence)
+     ]
+     output_image = MASK_ANNOTATOR.annotate(input_image, detections)
+     output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+     output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
+     return output_image
+
+
+ def process_image(
+     input_image: np.ndarray,
+     categories: str,
+     confidence_threshold: float = 0.3,
+     iou_threshold: float = 0.5,
+     # with_segmentation: bool = True,
+     with_confidence: bool = False,
+     with_class_agnostic_nms: bool = False,
+ ) -> np.ndarray:
+     # cleanup of old video files
+     remove_files_older_than(RESULTS, 30)
+
+     categories = process_categories(categories)
+     YOLO_WORLD_MODEL.set_classes(categories)
+     results = YOLO_WORLD_MODEL.infer(input_image, confidence=confidence_threshold)
+     detections = sv.Detections.from_inference(results)
+     detections = detections.with_nms(
+         class_agnostic=with_class_agnostic_nms,
+         threshold=iou_threshold
+     )
+     # if with_segmentation:
+     #     detections.mask = inference_with_boxes(
+     #         image=input_image,
+     #         xyxy=detections.xyxy,
+     #         model=EFFICIENT_SAM_MODEL,
+     #         device=DEVICE
+     #     )
+     output_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
+     output_image = annotate_image(
+         input_image=output_image,
+         detections=detections,
+         categories=categories,
+         with_confidence=with_confidence
+     )
+     return cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
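+
+ # Note: gr.Image(type='numpy') hands this function an RGB array, so the image is
+ # converted to BGR for the OpenCV-based annotators and back to RGB for display.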
+
+
+ def process_video(
+     input_video: str,
+     categories: str,
+     confidence_threshold: float = 0.3,
+     iou_threshold: float = 0.5,
+     # with_segmentation: bool = True,
+     with_confidence: bool = False,
+     with_class_agnostic_nms: bool = False,
+     progress=gr.Progress(track_tqdm=True)
+ ) -> str:
+     # cleanup of old video files
+     remove_files_older_than(RESULTS, 30)
+
+     categories = process_categories(categories)
+     YOLO_WORLD_MODEL.set_classes(categories)
+     video_info = sv.VideoInfo.from_video_path(input_video)
+     total = calculate_end_frame_index(input_video)
+     frame_generator = sv.get_video_frames_generator(
+         source_path=input_video,
+         end=total
+     )
+     result_file_name = generate_file_name(extension="mp4")
+     result_file_path = os.path.join(RESULTS, result_file_name)
+     with sv.VideoSink(result_file_path, video_info=video_info) as sink:
+         for _ in tqdm(range(total), desc="Processing video..."):
+             frame = next(frame_generator)
+             results = YOLO_WORLD_MODEL.infer(frame, confidence=confidence_threshold)
+             detections = sv.Detections.from_inference(results)
+             detections = detections.with_nms(
+                 class_agnostic=with_class_agnostic_nms,
+                 threshold=iou_threshold
+             )
+             # if with_segmentation:
+             #     detections.mask = inference_with_boxes(
+             #         image=frame,
+             #         xyxy=detections.xyxy,
+             #         model=EFFICIENT_SAM_MODEL,
+             #         device=DEVICE
+             #     )
+             frame = annotate_image(
+                 input_image=frame,
+                 detections=detections,
+                 categories=categories,
+                 with_confidence=with_confidence
+             )
+             sink.write_frame(frame)
+     return result_file_path
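+
+ # The finished video is written into the `results` directory; each new request
+ # also purges result files older than 30 minutes via remove_files_older_than.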
+
+
+ confidence_threshold_component = gr.Slider(
+     minimum=0,
+     maximum=1.0,
+     value=0.03,
+     step=0.01,
+     label="Limite de Confiança",
+     info=(
+         "O limite de confiança para o modelo YOLO-World. Reduza o limite para "
+         "diminuir falsos negativos, aumentando a sensibilidade do modelo aos "
+         "objetos procurados. Por outro lado, aumente o limite para minimizar "
+         "falsos positivos, evitando que o modelo identifique objetos que não "
+         "deveria."
+     ))
+
+ iou_threshold_component = gr.Slider(
+     minimum=0,
+     maximum=1.0,
+     value=0.1,
+     step=0.01,
+     label="Limite IoU",
+     info=(
+         "Limite de interseção sobre união (Intersection over Union, IoU) para a "
+         "supressão não máxima. Diminua o valor para reduzir a ocorrência de "
+         "caixas delimitadoras sobrepostas, tornando o processo de detecção mais "
+         "rigoroso. Por outro lado, aumente o valor para permitir mais caixas "
+         "delimitadoras sobrepostas, acomodando uma gama mais ampla de detecções."
+     ))
+
+ with_segmentation_component = gr.Checkbox(
+     value=False,
+     label="With Segmentation",
+     info=(
+         "Whether to run EfficientSAM for instance segmentation."
+     )
+ )
+
+ with_confidence_component = gr.Checkbox(
+     value=False,
+     label="Mostrar confiança",
+     info=(
+         "Mostrar ou não a confiança dos objetos detectados."
+     )
+ )
+
+ with_class_agnostic_nms_component = gr.Checkbox(
+     value=False,
+     label="Usar NMS (Non-Max Suppression ou Supressão Não Máxima) independente da classe",
+     info=(
+         "Suprima caixas delimitadoras sobrepostas em todas as classes."
+     )
+ )
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(MARKDOWN)
+     with gr.Accordion("Configuração", open=False):
+         confidence_threshold_component.render()
+         iou_threshold_component.render()
+         with gr.Row():
+             # with_segmentation_component.render()
+             with_confidence_component.render()
+             with_class_agnostic_nms_component.render()
+     with gr.Tab(label="Imagem"):
+         with gr.Row():
+             input_image_component = gr.Image(
+                 type='numpy',
+                 label='Imagem de entrada'
+             )
+             output_image_component = gr.Image(
+                 type='numpy',
+                 label='Imagem de saída'
+             )
+         with gr.Row():
+             image_categories_text_component = gr.Textbox(
+                 label='Categorias',
+                 placeholder='Digite as categorias separadas por vírgula',
+                 scale=7
+             )
+             image_submit_button_component = gr.Button(
+                 value='Submeter',
+                 scale=1,
+                 variant='primary'
+             )
+         # gr.Examples(
+         #     fn=process_image,
+         #     examples=IMAGE_EXAMPLES,
+         #     inputs=[
+         #         input_image_component,
+         #         image_categories_text_component,
+         #         confidence_threshold_component,
+         #         iou_threshold_component,
+         #         with_segmentation_component,
+         #         with_confidence_component,
+         #         with_class_agnostic_nms_component
+         #     ],
+         #     outputs=output_image_component
+         # )
+     with gr.Tab(label="Video"):
+         with gr.Row():
+             input_video_component = gr.Video(
+                 label='Video de entrada'
+             )
+             output_video_component = gr.Video(
+                 label='Video de saída'
+             )
+         with gr.Row():
+             video_categories_text_component = gr.Textbox(
+                 label='Categorias',
+                 placeholder='Digite as categorias separadas por vírgula',
+                 scale=7
+             )
+             video_submit_button_component = gr.Button(
+                 value='Submeter',
+                 scale=1,
+                 variant='primary'
+             )
+         # gr.Examples(
+         #     fn=process_video,
+         #     examples=VIDEO_EXAMPLES,
+         #     inputs=[
+         #         input_video_component,
+         #         video_categories_text_component,
+         #         confidence_threshold_component,
+         #         iou_threshold_component,
+         #         with_segmentation_component,
+         #         with_confidence_component,
+         #         with_class_agnostic_nms_component
+         #     ],
+         #     outputs=output_video_component
+         # )
+
+     image_submit_button_component.click(
+         fn=process_image,
+         inputs=[
+             input_image_component,
+             image_categories_text_component,
+             confidence_threshold_component,
+             iou_threshold_component,
+             # with_segmentation_component,
+             with_confidence_component,
+             with_class_agnostic_nms_component
+         ],
+         outputs=output_image_component
+     )
+     video_submit_button_component.click(
+         fn=process_video,
+         inputs=[
+             input_video_component,
+             video_categories_text_component,
+             confidence_threshold_component,
+             iou_threshold_component,
+             # with_segmentation_component,
+             with_confidence_component,
+             with_class_agnostic_nms_component
+         ],
+         outputs=output_video_component
+     )
+
+ demo.launch(debug=False, show_error=True, max_threads=1)
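+
+ # Note: max_threads=1 limits Gradio's worker pool to a single thread, which
+ # appears intended to keep concurrent requests from mutating the classes of the
+ # shared YOLO_WORLD_MODEL instance (set_classes is called on every request).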