IncreasingLoss committed on
Commit
ddb2570
·
verified ·
1 Parent(s): c7730f6

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +548 -532
app.py CHANGED
@@ -1,533 +1,549 @@
1
- """check torch"""
2
- import torch
3
- print(torch.__version__)
4
- print(torch.version.cuda)
5
- print(torch.backends.cudnn.enabled)
6
-
7
- """global variables"""
8
- user_device = "cuda" if torch.cuda.is_available() else "cpu"
9
-
10
- """Import libraries"""
11
- import cv2
12
- import os
13
- import glob
14
- from pathlib import Path
15
- from collections import Counter
16
- import gradio as gr
17
- from PIL import Image
18
- import numpy as np
19
- import time
20
-
21
- """yolo model loading"""
22
- from ultralytics import YOLO
23
- # Load model once at startup and keep in memory
24
- yoloV11_Small = YOLO("yolo_models_downloaded/yolo11s.pt", task="detect").to(user_device).eval()
25
-
26
- # Image preprocessing function
27
- def preprocess_image(image_path, max_size=640):
28
- """
29
- Resize image to have longest dimension = max_size while maintaining aspect ratio
30
- Returns the resized image and scale factor
31
- """
32
- try:
33
- # Read image
34
- img = cv2.imread(image_path)
35
- if img is None:
36
- raise IOError(f"Could not open image: {image_path}")
37
-
38
- # Get original dimensions
39
- h, w = img.shape[:2]
40
-
41
- # Calculate scale factor
42
- scale = min(max_size / max(h, w), 1.0) # Only scale down, not up
43
-
44
- # Skip resizing if image is already smaller than max_size
45
- if scale < 1.0:
46
- new_h, new_w = int(h * scale), int(w * scale)
47
- img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
48
-
49
- return img, scale
50
- except Exception as e:
51
- print(f"Error preprocessing image: {e}")
52
- return None, 1.0
53
-
54
- """ helper functions"""
55
- def return_attributes(result, frame_shape):
56
- """
57
- Returns a Dictionary with Lists:
58
- "labels", "left_right"
59
- """
60
- # Get frame dimensions
61
- if frame_shape is not None:
62
- try:
63
- height, width = frame_shape[:2] # Only get first two dimensions
64
- except (AttributeError, ValueError):
65
- height, width = 640, 640
66
- else:
67
- height, width = 640, 640
68
-
69
- # Initialize return lists
70
- centers = []
71
- l_m_r = []
72
- array_objects = []
73
-
74
- try:
75
- # Process box coordinates
76
- if hasattr(result, 'boxes') and hasattr(result.boxes, 'xywh'):
77
- for box in result.boxes.xywh:
78
- # Get x coordinate directly as float
79
- x = float(box[0].item() if hasattr(box[0], 'item') else box[0])
80
-
81
- # Determine position more efficiently
82
- if x < width * 0.25:
83
- l_m_r.append("left")
84
- elif x > width * 0.75:
85
- l_m_r.append("right")
86
- else:
87
- l_m_r.append("center")
88
-
89
- # Process class labels
90
- if hasattr(result, 'boxes') and hasattr(result.boxes, 'cls'):
91
- for index in result.boxes.cls:
92
- # Convert index to integer
93
- idx = int(index.item() if hasattr(index, 'item') else index)
94
- array_objects.append(str(result.names[idx]))
95
-
96
- except Exception as e:
97
- print(f"Error in return_attributes: {e}")
98
- return {"labels": [], "left_right": []}
99
-
100
- return {"labels": array_objects, "left_right": l_m_r}
101
-
102
- def sort_attributes(attributes):
103
- """Sort attributes based on position order"""
104
- # Define the desired order
105
- position_order = {'center': 0, 'left': 1, 'right': 2}
106
-
107
- # Combine the labels and positions into pairs
108
- combined = list(zip(attributes['left_right'], attributes['labels']))
109
-
110
- # Sort the combined list based on the defined position order
111
- sorted_combined = sorted(combined, key=lambda x: position_order.get(x[0], float('inf')))
112
-
113
- # Unzip the sorted pairs back into separate lists
114
- sorted_positions, sorted_labels = zip(*sorted_combined) if combined else ([], [])
115
-
116
- # Return the sorted attributes as a new dictionary
117
- return {
118
- 'labels': list(sorted_labels),
119
- 'left_right': list(sorted_positions)
120
- }
121
-
122
- def count_objects(labels, directions):
123
- """Group identical labels only when in the same direction"""
124
- # Count each (label, direction) pair
125
- pair_counts = Counter(zip(labels, directions))
126
-
127
- # Define direction ranking for sorting
128
- rank = {'center': 0, 'left': 1, 'right': 2}
129
-
130
- # Prepare a helper to pluralize labels when count > 1
131
- def pluralize(word, count):
132
- if count == 1:
133
- return f"a {word}"
134
- # simple English plural rules
135
- if any(word.endswith(s) for s in ('s','x','z','sh','ch')):
136
- return f"{count} {word}es"
137
- if word.endswith('y') and word[-2] not in 'aeiou':
138
- return f"{count} {word[:-1]}ies"
139
- return f"{count} {word}s"
140
-
141
- # Sort the unique (label, direction) keys by direction rank
142
- sorted_pairs = sorted(pair_counts.items(),
143
- key=lambda x: rank.get(x[0][1], float('inf')))
144
-
145
- # Build the output lists
146
- out_labels = []
147
- out_dirs = []
148
- for (label, direction), cnt in sorted_pairs:
149
- out_labels.append(pluralize(label, cnt))
150
- out_dirs.append(direction)
151
-
152
- return {'labels': out_labels, 'left_right': out_dirs}
153
-
154
- def join_items(items):
155
- """Join a list of strings into a human-readable list"""
156
- if len(items) == 1:
157
- return items[0]
158
- if len(items) == 2:
159
- return f"{items[0]} and {items[1]}"
160
- return ", ".join(items[:-1]) + " and " + items[-1]
161
-
162
- def make_sentence(attrib_dict):
163
- """Build a sentence from attribute dictionary"""
164
- labels = attrib_dict["labels"]
165
- directions = attrib_dict["left_right"]
166
-
167
- # Return early if no objects detected
168
- if not labels:
169
- return "No objects detected."
170
-
171
- # Group labels by direction
172
- grouped = {"center": [], "left": [], "right": []}
173
- for lbl, dr in zip(labels, directions):
174
- if dr in grouped:
175
- grouped[dr].append(lbl)
176
-
177
- parts = []
178
- has_center_objects = bool(grouped["center"])
179
-
180
- # Front clause (center)
181
- front = grouped["center"]
182
- if front:
183
- verb = "is" if len(front) == 1 and not front[0].startswith(('2 ', '3 ', '4 ')) else "are"
184
- parts.append(f"There {verb} {join_items(front)} in front of you")
185
-
186
- # Side clauses (left, right)
187
- side_parts = []
188
- for side in ("left", "right"):
189
- items = grouped[side]
190
- if items:
191
- if len(items) == 1:
192
- item = items[0]
193
- if item.startswith("a "):
194
- side_parts.append(f"There is {item} to your {side}")
195
- else:
196
- verb = "is" if item.startswith(("1 ", "one ")) else "are"
197
- side_parts.append(f"There {verb} {item} to your {side}")
198
- else:
199
- side_parts.append(f"There are {join_items(items)} to your {side}")
200
-
201
- # Determine whether to use "also"
202
- if side_parts and has_center_objects:
203
- # Modify first side part to include "also"
204
- if side_parts:
205
- first_side = side_parts[0]
206
- words = first_side.split(" ", 2)
207
- if len(words) >= 2:
208
- side_parts[0] = f"{words[0]} also {words[1]} {words[2]}"
209
-
210
- parts.extend(side_parts)
211
-
212
- # Join all parts with periods
213
- return ". ".join(parts) + "."
214
-
215
- def draw_detection(frame, result, display_labels=True, left_right_lines=True, scale=1.0):
216
- """Draw bounding boxes and labels on the frame with proper scaling"""
217
- try:
218
- # Use numpy arrays directly when possible to avoid unnecessary data transfers
219
- if hasattr(result.boxes, 'xyxy') and hasattr(result.boxes.xyxy, 'cpu'):
220
- boxes = result.boxes.xyxy.cpu().numpy()
221
- else:
222
- boxes = result.boxes.xyxy.numpy() if hasattr(result.boxes.xyxy, 'numpy') else result.boxes.xyxy
223
-
224
- if hasattr(result.boxes, 'cls') and hasattr(result.boxes.cls, 'cpu'):
225
- class_ids = result.boxes.cls.cpu().numpy().astype(int)
226
- else:
227
- class_ids = result.boxes.cls.numpy().astype(int) if hasattr(result.boxes.cls, 'numpy') else result.boxes.cls.astype(int)
228
-
229
- class_names = result.names
230
-
231
- # If image was resized, adjust the boxes back to original size
232
- # The boxes are in the coordinates of the resized image, so we need to scale them back
233
- if scale != 1.0:
234
- boxes = boxes / scale
235
-
236
- for index in range(len(boxes)):
237
- box = boxes[index]
238
- class_id = class_ids[index]
239
- label = str(class_names[class_id])
240
-
241
- # Ensure box coordinates are integers
242
- x1, y1, x2, y2 = map(int, box)
243
-
244
- # Draw rectangle
245
- cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
246
-
247
- if display_labels:
248
- # Add a background for text to make it more readable
249
- text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.75, 2)[0]
250
- cv2.rectangle(frame, (x1, y1-text_size[1]-10), (x1+text_size[0], y1), (0, 0, 0), -1)
251
- cv2.putText(frame, label, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 0), 2)
252
-
253
- if left_right_lines:
254
- h, w = frame.shape[:2]
255
- cv2.line(frame, (int(w*.25), 0), (int(w*0.25), h), (0, 255, 0), 2)
256
- cv2.line(frame, (int(w*.75), 0), (int(w*0.75), h), (0, 255, 0), 2)
257
-
258
- except Exception as e:
259
- print(f"Error drawing detections: {e}")
260
- import traceback
261
- traceback.print_exc()
262
-
263
- def add_sentence_to_image(frame, sentence):
264
- """Add descriptive sentence to bottom of image"""
265
- if not sentence:
266
- return
267
-
268
- h, w = frame.shape[:2]
269
-
270
- # Calculate font size based on image dimensions
271
- font_scale = min(w, h) / 1000 # Scale font with image size
272
- font_scale = max(0.6, min(5, font_scale)) # Keep between 0.6 and 1.2
273
-
274
- # Create a semi-transparent background for better text visibility
275
- font = cv2.FONT_HERSHEY_SIMPLEX
276
- text_size = cv2.getTextSize(sentence, font, font_scale, 2)[0]
277
-
278
- # Split text into multiple lines if too long
279
- max_width = int(w * 0.9) # Maximum width for text is 90% of image width
280
- if text_size[0] > max_width:
281
- words = sentence.split()
282
- lines = []
283
- current_line = []
284
-
285
- for word in words:
286
- test_line = ' '.join(current_line + [word])
287
- test_size = cv2.getTextSize(test_line, font, font_scale, 2)[0]
288
-
289
- if test_size[0] <= max_width:
290
- current_line.append(word)
291
- else:
292
- lines.append(' '.join(current_line))
293
- current_line = [word]
294
-
295
- if current_line:
296
- lines.append(' '.join(current_line))
297
-
298
- # Draw background and text for each line
299
- padding = 10
300
- line_height = text_size[1] + padding
301
- bg_height = (line_height * len(lines)) + padding * 2
302
-
303
- # Create semi-transparent overlay for text background
304
- overlay = frame.copy()
305
- bg_y1 = h - bg_height
306
- cv2.rectangle(overlay, (0, bg_y1), (w, h), (0, 0, 0), -1)
307
- cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
308
-
309
- # Draw each line of text
310
- for i, line in enumerate(lines):
311
- y_pos = bg_y1 + padding + (i + 1) * line_height - padding//2
312
- cv2.putText(frame, line, (padding, y_pos), font, font_scale, (255, 255, 255), 2)
313
- else:
314
- # For short sentences, just add a single line of text
315
- padding = 10
316
-
317
- # Create semi-transparent overlay for text background
318
- overlay = frame.copy()
319
- bg_y1 = h - text_size[1] - padding * 3
320
- cv2.rectangle(overlay, (0, bg_y1), (w, h), (0, 0, 0), -1)
321
- cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
322
-
323
- # Position text in bottom right with padding
324
- cv2.putText(frame, sentence, (padding, h - padding), font, font_scale, (255, 255, 255), 2)
325
-
326
- """process audio / video"""
327
- def process_image(image_path, confidence=0.5):
328
- """
329
- Process a single image file with YOLO detection
330
- Adds a descriptive sentence at the bottom of the image
331
- Returns the processed image
332
- """
333
- if not image_path:
334
- return None
335
-
336
- try:
337
- start_time = time.time()
338
-
339
- # Store original image for drawing results
340
- original_img = cv2.imread(image_path)
341
- if original_img is None:
342
- return None
343
-
344
- # Preprocess image - resize to max 640px on longest side for YOLO processing
345
- processed_img, scale_factor = preprocess_image(image_path, max_size=640)
346
- if processed_img is None:
347
- return None
348
-
349
- # Run YOLO inference with optimized settings on the RESIZED image
350
- results = yoloV11_Small(
351
- processed_img,
352
- conf=confidence,
353
- verbose=False,
354
- iou=0.5,
355
- agnostic_nms=True,
356
- device=user_device
357
- )
358
-
359
- # Create a copy of the ORIGINAL image for drawing
360
- output_frame = original_img.copy()
361
-
362
- # Process the detections
363
- current_sentence = "No objects detected."
364
- for result in results:
365
- # Get original frame shape for attribute calculation
366
- frame_shape = processed_img.shape # Use the processed image shape for attribute calculation
367
-
368
- # Draw detections on the ORIGINAL frame with proper scaling
369
- draw_detection(output_frame, result, display_labels=True, left_right_lines=True, scale=scale_factor)
370
-
371
- # Process attributes based on the PROCESSED image (as that's what YOLO saw)
372
- attributes = return_attributes(result, frame_shape)
373
-
374
- if attributes["labels"]:
375
- attrs = sort_attributes(attributes)
376
- counted = count_objects(attrs["labels"], attrs["left_right"])
377
- current_sentence = make_sentence(counted)
378
-
379
- # Add the sentence to the image
380
- add_sentence_to_image(output_frame, current_sentence)
381
-
382
- # Convert from BGR to RGB for displaying in Gradio
383
- output_frame_rgb = cv2.cvtColor(output_frame, cv2.COLOR_BGR2RGB)
384
-
385
- end_time = time.time()
386
- print(f"Processing time: {end_time - start_time:.3f} seconds")
387
-
388
- return output_frame_rgb
389
-
390
- except Exception as e:
391
- print(f"Image processing error: {e}")
392
- import traceback
393
- traceback.print_exc()
394
- return None
395
-
396
-
397
- """gradio app for image processing"""
398
-
399
- EXAMPLES_FOLDER = "examples"
400
- TEMP_FILES_FOLDER = Path("temp_files")
401
-
402
- # Ensure temp folder exists
403
- if not os.path.exists(TEMP_FILES_FOLDER):
404
- os.makedirs(TEMP_FILES_FOLDER)
405
-
406
- # Function to load example images from a folder
407
- def load_examples_from_folder(folder_path=EXAMPLES_FOLDER):
408
- """Load all images from a specified folder as examples"""
409
- if not os.path.exists(folder_path):
410
- os.makedirs(folder_path)
411
- print(f"Created examples folder at {folder_path}")
412
- return []
413
-
414
- # Valid file extensions for images only
415
- image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"]
416
-
417
- # Get all image files in the folder efficiently
418
- example_files = []
419
- for ext in image_extensions:
420
- example_files.extend(glob.glob(os.path.join(folder_path, f"*{ext}")))
421
- example_files.extend(glob.glob(os.path.join(folder_path, f"*{ext.upper()}")))
422
-
423
- print(f"Loaded {len(example_files)} example images from {folder_path}")
424
- return example_files
425
-
426
- # Load example images once at startup
427
- raw = load_examples_from_folder()
428
- unique = list(dict.fromkeys(raw)) # remove duplicates
429
- example_images = [(p, None) for p in unique] # tell Gallery about each image
430
-
431
- def select_from_gallery(evt: gr.SelectData):
432
- """Handle selection from the gallery"""
433
- selected_path = example_images[evt.index][0]
434
- return selected_path
435
-
436
- custom_css = """
437
- /* Allow auto height for content areas */
438
- .fixed-height.svelte-842rpi.svelte-842rpi {
439
- min-height: 0 !important;
440
- max-height: none !important;
441
- height: auto !important;
442
- }
443
-
444
- /* Maintain responsive behavior */
445
- @media (min-width: 1280px) {
446
- .fixed-height.svelte-842rpi.svelte-842rpi {
447
- min-height: 0 !important;
448
- max-height: none !important;
449
- height: auto !important;
450
- }
451
- }
452
- """
453
-
454
- # Create the Gradio interface
455
- with gr.Blocks(title="Object Detection and Description", css=custom_css) as demo:
456
- gr.Markdown("## 🔍 Object Detection with Scene Description")
457
- gr.Markdown("Upload or select an image to detect objects and get a descriptive sentence.")
458
-
459
- # Store current image path
460
- current_image = gr.State()
461
-
462
- with gr.Row():
463
- with gr.Column(scale=1):
464
- # Image input component
465
- image_input = gr.Image(label="Upload Image", type="filepath")
466
-
467
- # Controls
468
- upload_btn = gr.UploadButton(
469
- "Upload Image",
470
- file_types=["image"]
471
- )
472
-
473
- confidence = gr.Slider(
474
- minimum=0.1,
475
- maximum=0.9,
476
- value=0.5,
477
- step=0.025,
478
- label="Detection Confidence"
479
- )
480
-
481
- with gr.Column(scale=1):
482
- # Output component
483
- image_output = gr.Image(label="Scene Analysis")
484
-
485
- # Example gallery at bottom
486
- with gr.Row(variant="panel"):
487
- if example_images:
488
- examples_gallery = gr.Gallery(
489
- value=example_images,
490
- label=f"Example Images (Click to Select) - {len(example_images)} examples from {EXAMPLES_FOLDER}",
491
- columns=6,
492
- elem_id="image_gallery",
493
- allow_preview=False,
494
- elem_classes=["centered-examples"]
495
- )
496
- else:
497
- gr.Markdown(
498
- f"No example images found in {EXAMPLES_FOLDER} folder. Add image files to see examples."
499
- )
500
-
501
- # Connect components
502
- upload_btn.upload(
503
- fn=lambda file_obj: file_obj.name if hasattr(file_obj, 'name') else str(file_obj),
504
- inputs=[upload_btn],
505
- outputs=[image_input]
506
- )
507
-
508
- image_input.change(
509
- fn=lambda x: x,
510
- inputs=[image_input],
511
- outputs=[current_image]
512
- )
513
-
514
- if example_images:
515
- examples_gallery.select(
516
- fn=select_from_gallery,
517
- outputs=[image_input]
518
- )
519
-
520
- # Process immediately when image is uploaded
521
- image_input.change(
522
- fn=process_image,
523
- inputs=[image_input, confidence],
524
- outputs=[image_output]
525
- )
526
-
527
- if __name__ == "__main__":
528
- # Create examples folder if it doesn't exist
529
- if not os.path.exists(EXAMPLES_FOLDER):
530
- os.makedirs(EXAMPLES_FOLDER)
531
- print(f"Created examples folder at {EXAMPLES_FOLDER}")
532
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  demo.launch(share=False)
 
1
+ """check torch"""
2
+ import torch
3
+ print(torch.__version__)
4
+ print(torch.version.cuda)
5
+ print(torch.backends.cudnn.enabled)
6
+
7
+ """global variables"""
8
+ user_device = "cuda" if torch.cuda.is_available() else "cpu"
9
+
10
+ """Import libraries"""
11
+ import cv2
12
+ import os
13
+ import glob
14
+ from pathlib import Path
15
+ from collections import Counter
16
+ import gradio as gr
17
+ from PIL import Image
18
+ import numpy as np
19
+ import time
20
+
21
+ """yolo model loading"""
22
+ from ultralytics import YOLO
23
+ # Load model once at startup and keep in memory
24
+ yoloV11_Small = YOLO("yolo_models_downloaded/yolo11s.pt", task="detect").to(user_device).eval()
25
+
26
+ # Image preprocessing function
27
def preprocess_image(image_path, max_size=640):
    """Load an image and shrink it so its longest side is at most ``max_size``.

    Images that already fit are returned at their original size; nothing is
    ever scaled up.

    Args:
        image_path: Path of the image file, handed to ``cv2.imread``.
        max_size: Upper bound, in pixels, for the longest side of the result.

    Returns:
        A ``(image, scale)`` tuple. ``scale`` is the factor (<= 1.0) applied to
        the original dimensions. On any failure the error is printed and
        ``(None, 1.0)`` is returned instead of raising.
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            raise IOError(f"Could not open image: {image_path}")

        h, w = img.shape[:2]

        # Cap at 1.0 so small images are never enlarged.
        scale = min(max_size / max(h, w), 1.0)

        if scale < 1.0:
            # int() truncation can reach 0 for very thin images (e.g. 1x5000);
            # keep each side at least one pixel so the resize target is valid.
            new_h = max(1, int(h * scale))
            new_w = max(1, int(w * scale))
            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

        return img, scale
    except Exception as e:
        # Best-effort loader: callers test for None rather than catching.
        print(f"Error preprocessing image: {e}")
        return None, 1.0
53
+
54
+ """ helper functions"""
55
def return_attributes(result, frame_shape):
    """Extract class labels and coarse horizontal positions from one result.

    Args:
        result: Detection result exposing ``boxes.xywh`` (rows whose first
            element is used as the x position), ``boxes.cls`` (class indices)
            and ``names`` (index -> label mapping). Elements may be plain
            numbers or objects with an ``.item()`` method.
            NOTE(review): x is presumably the box centre in Ultralytics'
            xywh layout -- confirm against the installed version.
        frame_shape: Shape of the frame the coordinates refer to, as
            ``(height, width, ...)``. When it is ``None`` or cannot be
            unpacked, a width of 640 is assumed.

    Returns:
        ``{"labels": [...], "left_right": [...]}`` where each position is
        ``"left"`` (x below 25% of the width), ``"right"`` (x above 75%) or
        ``"center"``. Both lists are empty if extraction fails.
    """
    # Only the width is needed: positions are judged on the horizontal axis.
    try:
        _, width = frame_shape[:2]
    except (AttributeError, TypeError, ValueError):
        width = 640

    def as_scalar(value):
        # Tensor/array elements expose .item(); plain numbers pass through.
        return value.item() if hasattr(value, "item") else value

    boxes = getattr(result, "boxes", None)
    positions = []
    labels = []

    try:
        if hasattr(boxes, "xywh"):
            for box in boxes.xywh:
                x = float(as_scalar(box[0]))
                if x < width * 0.25:
                    positions.append("left")
                elif x > width * 0.75:
                    positions.append("right")
                else:
                    positions.append("center")

        if hasattr(boxes, "cls"):
            for index in boxes.cls:
                labels.append(str(result.names[int(as_scalar(index))]))
    except Exception as e:
        # Best-effort: report and fall back to "nothing detected".
        print(f"Error in return_attributes: {e}")
        return {"labels": [], "left_right": []}

    return {"labels": labels, "left_right": positions}
101
+
102
def sort_attributes(attributes):
    """Reorder labels and positions so that centre comes first, then left, then right.

    Args:
        attributes: Dict with parallel lists under ``'labels'`` and
            ``'left_right'``.

    Returns:
        A new dict with the same two keys. Entries sharing a position keep
        their relative order; unrecognised positions are placed last.
    """
    priority = {'center': 0, 'left': 1, 'right': 2}

    def by_position(pair):
        return priority.get(pair[0], float('inf'))

    pairs = list(zip(attributes['left_right'], attributes['labels']))
    pairs.sort(key=by_position)

    ordered_positions = [position for position, _ in pairs]
    ordered_labels = [label for _, label in pairs]
    return {'labels': ordered_labels, 'left_right': ordered_positions}
121
+
122
def count_objects(labels, directions):
    """Collapse repeated labels that share a direction into counted phrases.

    Each distinct ``(label, direction)`` pair becomes one phrase: ``"a dog"``
    for a single object, ``"3 dogs"`` for several. Pairs are ordered centre,
    left, right (unknown directions last); within a direction the order of
    first appearance is kept.

    Args:
        labels: Object labels, parallel to ``directions``.
        directions: Position of each label (``"center"``, ``"left"``,
            ``"right"``).

    Returns:
        ``{'labels': [...phrases...], 'left_right': [...directions...]}``.
    """
    rank = {'center': 0, 'left': 1, 'right': 2}

    # Nouns the suffix rules below would get wrong ("knifes", "mouses", ...).
    irregular = {'person': 'people', 'knife': 'knives', 'mouse': 'mice', 'sheep': 'sheep'}
    # Nouns that are already plural in form; counted in pairs instead.
    pair_nouns = {'skis', 'scissors'}

    def pluralize(word, count):
        # Rules apply to the final word so multi-word labels work too.
        head, sep, last = word.rpartition(' ')
        if last in pair_nouns:
            if count == 1:
                return f"a pair of {word}"
            return f"{count} pairs of {word}"
        if count == 1:
            return f"a {word}"
        if last in irregular:
            return f"{count} {head}{sep}{irregular[last]}"
        if last.endswith(('s', 'x', 'z', 'sh', 'ch')):
            return f"{count} {word}es"
        # len check: a one-letter word has no preceding character to inspect.
        if len(last) > 1 and last.endswith('y') and last[-2] not in 'aeiou':
            return f"{count} {word[:-1]}ies"
        return f"{count} {word}s"

    pair_counts = Counter(zip(labels, directions))

    # sorted() is stable, so first-seen order survives within a direction.
    sorted_pairs = sorted(
        pair_counts.items(),
        key=lambda item: rank.get(item[0][1], float('inf')),
    )

    out_labels = [pluralize(label, cnt) for (label, _), cnt in sorted_pairs]
    out_dirs = [direction for (_, direction), _ in sorted_pairs]
    return {'labels': out_labels, 'left_right': out_dirs}
153
+
154
def join_items(items):
    """Join strings into a human-readable list.

    ``["a"]`` -> ``"a"``, ``["a", "b"]`` -> ``"a and b"``,
    ``["a", "b", "c"]`` -> ``"a, b and c"``. An empty list gives ``""``.
    """
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    return ", ".join(items[:-1]) + " and " + items[-1]


def make_sentence(attrib_dict):
    """Build a spoken-style description from counted labels and positions.

    Args:
        attrib_dict: Dict with parallel lists ``"labels"`` (phrases such as
            ``"a dog"`` or ``"3 cars"``) and ``"left_right"`` (``"center"``,
            ``"left"`` or ``"right"``). Entries with any other position are
            ignored.

    Returns:
        One clause per occupied position, in the order centre, left, right,
        joined with ``". "`` and terminated by a period. When a centre clause
        exists, the first side clause gets "also". Returns
        ``"No objects detected."`` when nothing usable is present.
    """
    grouped = {"center": [], "left": [], "right": []}
    for label, direction in zip(attrib_dict["labels"], attrib_dict["left_right"]):
        if direction in grouped:
            grouped[direction].append(label)

    # Covers both "no labels" and "labels with only unknown positions".
    if not any(grouped.values()):
        return "No objects detected."

    def verb_for(items):
        # Plural when several phrases are listed, or when the single phrase
        # starts with a count other than 1 ("2 cats", "12 birds").
        if len(items) > 1:
            return "are"
        first_word = items[0].split(" ", 1)[0]
        if first_word.isdigit() and first_word != "1":
            return "are"
        return "is"

    parts = []

    front = grouped["center"]
    if front:
        parts.append(f"There {verb_for(front)} {join_items(front)} in front of you")

    # "also" belongs only to the first side clause that follows a centre clause.
    use_also = bool(front)
    for side in ("left", "right"):
        items = grouped[side]
        if not items:
            continue
        also = "also " if use_also else ""
        use_also = False
        parts.append(f"There {also}{verb_for(items)} {join_items(items)} to your {side}")

    return ". ".join(parts) + "."
214
+
215
def draw_detection(frame, result, display_labels=True, left_right_lines=True, scale=1.0):
    """Draw detection boxes, labels and the left/right guide lines on ``frame``.

    The frame is modified in place and nothing is returned. Errors are
    printed (with traceback) rather than raised.

    Args:
        frame: Image array to draw on.
        result: Detection result exposing ``boxes.xyxy``, ``boxes.cls`` and
            ``names``. A result whose ``boxes`` is ``None`` is treated as
            having no detections.
        display_labels: Draw the class name on a black background at each box.
        left_right_lines: Draw vertical lines at 25% and 75% of the frame
            width, the same thresholds used to classify left/centre/right.
        scale: Factor by which the detected image was shrunk relative to
            ``frame``; box coordinates are divided by it before drawing.
    """
    green = (0, 255, 0)

    def to_numpy(values):
        # Tensors may live on an accelerator; bring them to host memory first.
        if hasattr(values, "cpu"):
            return values.cpu().numpy()
        if hasattr(values, "numpy"):
            return values.numpy()
        return np.asarray(values)

    try:
        detections = getattr(result, "boxes", None)
        if detections is not None:
            boxes = to_numpy(detections.xyxy)
            class_ids = to_numpy(detections.cls).astype(int)
            class_names = result.names

            # Boxes are in the resized image's coordinates; map them back.
            if scale != 1.0:
                boxes = boxes / scale

            for box, class_id in zip(boxes, class_ids):
                x1, y1, x2, y2 = (int(v) for v in box)
                cv2.rectangle(frame, (x1, y1), (x2, y2), green, 2)

                if display_labels:
                    label = str(class_names[class_id])
                    font = cv2.FONT_HERSHEY_SIMPLEX
                    text_w, text_h = cv2.getTextSize(label, font, 0.75, 2)[0]
                    # Put the label above the box; if that would leave the
                    # image, tuck it just inside the top edge instead.
                    label_bottom = y1 if y1 - text_h - 10 >= 0 else y1 + text_h + 10
                    cv2.rectangle(
                        frame,
                        (x1, label_bottom - text_h - 10),
                        (x1 + text_w, label_bottom),
                        (0, 0, 0),
                        -1,
                    )
                    cv2.putText(frame, label, (x1, label_bottom - 10), font, 0.75, green, 2)

        # Guide lines are drawn once, whether or not anything was detected.
        if left_right_lines:
            h, w = frame.shape[:2]
            for fraction in (0.25, 0.75):
                x = int(w * fraction)
                cv2.line(frame, (x, 0), (x, h), green, 2)

    except Exception as e:
        print(f"Error drawing detections: {e}")
        import traceback
        traceback.print_exc()
262
+
263
def add_sentence_to_image(frame, sentence):
    """Write ``sentence`` on a darkened band along the bottom of ``frame``.

    The frame is modified in place. The text is left-aligned and wrapped
    onto several lines when it is wider than 90% of the image. Nothing is
    drawn for an empty sentence.

    Args:
        frame: Image array to draw on.
        sentence: Text to render.
    """
    if not sentence:
        return

    h, w = frame.shape[:2]
    font = cv2.FONT_HERSHEY_SIMPLEX
    thickness = 2
    padding = 10

    # Font grows with the image, clamped to the range 0.6 .. 5.
    font_scale = max(0.6, min(5, min(w, h) / 1000))

    def text_width(text):
        return cv2.getTextSize(text, font, font_scale, thickness)[0][0]

    text_size = cv2.getTextSize(sentence, font, font_scale, thickness)[0]
    max_width = int(w * 0.9)

    if text_size[0] <= max_width:
        lines = [sentence]
    else:
        # Greedy word wrap: start a new line when the next word would not fit.
        lines = []
        current_line = []
        for word in sentence.split():
            candidate = current_line + [word]
            if text_width(' '.join(candidate)) <= max_width:
                current_line = candidate
            else:
                # A first word wider than the limit must not emit a blank line.
                if current_line:
                    lines.append(' '.join(current_line))
                current_line = [word]
        if current_line:
            lines.append(' '.join(current_line))

    if len(lines) == 1:
        bg_y1 = h - text_size[1] - padding * 3
        baselines = [h - padding]
    else:
        line_height = text_size[1] + padding
        bg_y1 = h - (line_height * len(lines) + padding * 2)
        baselines = [
            bg_y1 + padding + (i + 1) * line_height - padding // 2
            for i in range(len(lines))
        ]

    # Blend a black band over the bottom so white text stays readable.
    overlay = frame.copy()
    cv2.rectangle(overlay, (0, bg_y1), (w, h), (0, 0, 0), -1)
    cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)

    for line, y_pos in zip(lines, baselines):
        cv2.putText(frame, line, (padding, y_pos), font, font_scale, (255, 255, 255), thickness)
325
+
326
+ """process audio / video"""
327
def process_image(image_path, confidence=0.5):
    """Run detection on one image file and return an annotated RGB copy.

    Detection runs on a copy shrunk to at most 640 px on its longest side;
    boxes, guide lines and the descriptive sentence are drawn on the
    full-size original.

    Args:
        image_path: Path of the image to analyse. A falsy value yields None.
        confidence: Confidence threshold forwarded to the model as ``conf``.

    Returns:
        The annotated image converted from BGR to RGB, or ``None`` when the
        path is empty, the image cannot be read, or processing fails.
    """
    if not image_path:
        return None

    try:
        started = time.time()

        # Full-resolution copy: this is what gets drawn on and returned.
        full_res = cv2.imread(image_path)
        if full_res is None:
            return None

        # Reduced copy: this is what the model actually sees.
        small, shrink = preprocess_image(image_path, max_size=640)
        if small is None:
            return None

        detections = yoloV11_Small(
            small,
            conf=confidence,
            verbose=False,
            iou=0.5,
            agnostic_nms=True,
            device=user_device,
        )

        annotated = full_res.copy()
        sentence = "No objects detected."

        for detection in detections:
            # Boxes are rescaled onto the original frame while drawing.
            draw_detection(annotated, detection, display_labels=True, left_right_lines=True, scale=shrink)

            # Positions are judged against the reduced image's dimensions.
            found = return_attributes(detection, small.shape)
            if not found["labels"]:
                continue

            ordered = sort_attributes(found)
            tallied = count_objects(ordered["labels"], ordered["left_right"])
            sentence = make_sentence(tallied)

        add_sentence_to_image(annotated, sentence)

        # OpenCV images are BGR; the UI expects RGB.
        rgb_frame = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

        finished = time.time()
        print(f"Processing time: {finished - started:.3f} seconds")

        return rgb_frame

    except Exception as e:
        print(f"Image processing error: {e}")
        import traceback
        traceback.print_exc()
        return None
395
+
396
+
397
+ """gradio app for image processing"""
398
+
399
EXAMPLES_FOLDER = "examples"
TEMP_FILES_FOLDER = Path("temp_files")

# Ensure temp folder exists. exist_ok=True makes this a single race-free call
# instead of a separate "exists?" check followed by a create.
os.makedirs(TEMP_FILES_FOLDER, exist_ok=True)


# Function to load example images from a folder
def load_examples_from_folder(folder_path=EXAMPLES_FOLDER):
    """Return the image files found directly inside ``folder_path``.

    If the folder does not exist it is created and an empty list is returned.
    Otherwise every regular, non-hidden file whose extension matches one of
    the supported image types is returned. The extension comparison is
    case-insensitive, so ``.jpg``, ``.JPG`` and ``.Jpg`` are all accepted.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A sorted list of path strings of the form
        ``os.path.join(folder_path, filename)``, each file listed once.
    """
    if not os.path.exists(folder_path):
        os.makedirs(folder_path, exist_ok=True)
        print(f"Created examples folder at {folder_path}")
        return []

    # Valid file extensions for images only (tuple so str.endswith accepts it)
    image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp")

    try:
        names = os.listdir(folder_path)
    except OSError:
        # folder_path exists but cannot be listed (e.g. it is a regular
        # file); treat it as containing no examples rather than crashing.
        names = []

    # One pass over the directory: a file can only appear once, so no
    # duplicates are produced on case-insensitive filesystems, and sorting
    # makes the gallery order deterministic across platforms.
    example_files = sorted(
        os.path.join(folder_path, name)
        for name in names
        if not name.startswith(".")  # skip hidden files, as glob's "*" does
        and name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(folder_path, name))
    )

    print(f"Loaded {len(example_files)} example images from {folder_path}")
    return example_files
425
+
426
# Discover the example images a single time, when the module is imported.
raw = load_examples_from_folder()
# dict keys keep first-seen order, so this drops repeated paths without
# reshuffling the remaining ones.
unique = [*dict.fromkeys(raw)]
# The gallery is fed (path, caption) pairs; no caption is supplied here.
example_images = [(path, None) for path in unique]
430
+
431
def select_from_gallery(evt: gr.SelectData):
    """Return the file path of the example the user clicked in the gallery.

    ``evt.index`` is used as a position into ``example_images``, whose
    entries are ``(path, caption)`` pairs; only the path is returned.
    """
    path, _caption = example_images[evt.index]
    return path
435
+
436
# Stylesheet handed to gr.Blocks(css=...). It removes the fixed height from
# elements carrying the "fixed-height" class so they size to their content.
# NOTE(review): the ".svelte-842rpi" suffix looks like a build-generated class
# name and may change between Gradio releases - confirm after upgrading.
custom_css = """
/* Allow auto height for content areas */
.fixed-height.svelte-842rpi.svelte-842rpi {
    min-height: 0 !important;
    max-height: none !important;
    height: auto !important;
}

/* Maintain responsive behavior */
@media (min-width: 1280px) {
    .fixed-height.svelte-842rpi.svelte-842rpi {
        min-height: 0 !important;
        max-height: none !important;
        height: auto !important;
    }
}
"""
453
+
454
# Create the Gradio interface.
# Layout: an input column (image, upload button, confidence slider) next to an
# output column, then the example gallery, then a static "video version" panel.
# NOTE(review): container nesting below is written out explicitly; confirm the
# panel arrangement matches the intended page layout.
with gr.Blocks(title="VoiceView: Object Detection and Description", css=custom_css) as demo:
    gr.Markdown("## 🔍 VoiceView: Object Detection with Scene Description")
    gr.Markdown("Upload or select an image to detect objects and get a descriptive sentence.")

    # Store current image path
    current_image = gr.State()

    with gr.Row():
        with gr.Column(scale=1):
            # Image input component; type="filepath" means handlers receive a
            # path string, which is what process_image reads from disk.
            image_input = gr.Image(label="Upload Image", type="filepath")

            # Controls
            upload_btn = gr.UploadButton(
                "Upload Image",
                file_types=["image"],
            )

            confidence = gr.Slider(
                minimum=0.1,
                maximum=0.9,
                value=0.5,
                step=0.025,
                label="Detection Confidence",
            )

        with gr.Column(scale=1):
            # Output component
            image_output = gr.Image(label="Scene Analysis")

    # Example gallery at bottom
    with gr.Row(variant="panel"):
        if example_images:
            examples_gallery = gr.Gallery(
                value=example_images,
                label=f"Example Images (Click to Select) - {len(example_images)} examples from {EXAMPLES_FOLDER}",
                columns=6,
                elem_id="image_gallery",
                allow_preview=False,
                elem_classes=["centered-examples"],
            )
        else:
            gr.Markdown(
                f"No example images found in {EXAMPLES_FOLDER} folder. Add image files to see examples."
            )

    with gr.Column(variant="panel", scale=1):
        gr.Markdown("## Video Version")
        gr.Markdown("#### Unfortunately, this only runs locally due to latency when uploading images to Spaces.")
        gr.Markdown("#### Test it yourself: download the `examples_video` folder and `app_local.py`.")
        gr.Markdown("#### Don’t forget to install the required dependencies! :D")

        with gr.Row(variant="panel"):
            # Pre-rendered clips shown side by side: inputs left, outputs right.
            # NOTE(review): these paths are relative to the working directory;
            # presumably the files ship with the app - verify they exist.
            with gr.Column(scale=1):
                gr.Markdown("### Input-Videos")
                gr.Video("examples_video/test_video7_resolve.mp4", interactive=True, visible=True)
                gr.Video("examples_video/test_video2_resolve.mp4", interactive=True, visible=True)
            with gr.Column(scale=1):
                gr.Markdown("### Model-Output")
                gr.Video("converted_video/test_video7_resolve_converted.mp4", interactive=True, visible=True)
                gr.Video("converted_video/test_video2_resolve_converted.mp4", interactive=True, visible=True)

    # Connect components
    # The upload button forwards the uploaded file's path into the image input,
    # which in turn fires the change handlers registered below.
    upload_btn.upload(
        fn=lambda file_obj: file_obj.name if hasattr(file_obj, 'name') else str(file_obj),
        inputs=[upload_btn],
        outputs=[image_input]
    )

    # Mirror the current image path into session state.
    image_input.change(
        fn=lambda x: x,
        inputs=[image_input],
        outputs=[current_image]
    )

    # examples_gallery only exists when at least one example was found.
    if example_images:
        examples_gallery.select(
            fn=select_from_gallery,
            outputs=[image_input]
        )

    # Process immediately when image is uploaded
    image_input.change(
        fn=process_image,
        inputs=[image_input, confidence],
        outputs=[image_output]
    )

    # Re-run detection when the user lets go of the confidence slider, so a
    # new threshold applies to the image already loaded. Without this the
    # slider had no effect until a different image was selected. With no
    # image loaded, process_image receives an empty path and returns None.
    confidence.release(
        fn=process_image,
        inputs=[image_input, confidence],
        outputs=[image_output]
    )
542
+
543
if __name__ == "__main__":
    # Make sure the examples directory is present before the UI starts.
    examples_dir_missing = not os.path.exists(EXAMPLES_FOLDER)
    if examples_dir_missing:
        os.makedirs(EXAMPLES_FOLDER)
        print(f"Created examples folder at {EXAMPLES_FOLDER}")

    # Start the Gradio server with sharing turned off.
    demo.launch(share=False)