Spaces:

IncreasingLoss
/

VoiceView

Running

App Files Files

xet

Community

IncreasingLoss committed on May 4

Commit

ddb2570

verified ·

1 Parent(s): c7730f6

Upload app.py

Browse files

Files changed (1) hide show

app.py +548 -532

app.py CHANGED Viewed

@@ -1,533 +1,549 @@
-"""check torch"""
-import torch
-print(torch.__version__)
-print(torch.version.cuda)
-print(torch.backends.cudnn.enabled)
-"""global variables"""
-user_device = "cuda" if torch.cuda.is_available() else "cpu"
-"""Import libraries"""
-import cv2
-import os
-import glob
-from pathlib import Path
-from collections import Counter
-import gradio as gr
-from PIL import Image
-import numpy as np
-import time
-"""yolo model loading"""
-from ultralytics import YOLO
-# Load model once at startup and keep in memory
-yoloV11_Small = YOLO("yolo_models_downloaded/yolo11s.pt", task="detect").to(user_device).eval()
-# Image preprocessing function
-def preprocess_image(image_path, max_size=640):
-    """
-    Resize image to have longest dimension = max_size while maintaining aspect ratio
-    Returns the resized image and scale factor
-    """
-    try:
-        # Read image
-        img = cv2.imread(image_path)
-        if img is None:
-            raise IOError(f"Could not open image: {image_path}")
-        # Get original dimensions
-        h, w = img.shape[:2]
-        # Calculate scale factor
-        scale = min(max_size / max(h, w), 1.0)  # Only scale down, not up
-        # Skip resizing if image is already smaller than max_size
-        if scale < 1.0:
-            new_h, new_w = int(h * scale), int(w * scale)
-            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
-        return img, scale
-    except Exception as e:
-        print(f"Error preprocessing image: {e}")
-        return None, 1.0
-""" helper functions"""
-def return_attributes(result, frame_shape):
-    """
-    Returns a Dictionary with Lists:
-    "labels", "left_right"
-    """
-    # Get frame dimensions
-    if frame_shape is not None:
-        try:
-            height, width = frame_shape[:2]  # Only get first two dimensions
-        except (AttributeError, ValueError):
-            height, width = 640, 640
-    else:
-        height, width = 640, 640
-    # Initialize return lists
-    centers = []
-    l_m_r = []
-    array_objects = []
-    try:
-        # Process box coordinates
-        if hasattr(result, 'boxes') and hasattr(result.boxes, 'xywh'):
-            for box in result.boxes.xywh:
-                # Get x coordinate directly as float
-                x = float(box[0].item() if hasattr(box[0], 'item') else box[0])
-                # Determine position more efficiently
-                if x < width * 0.25:
-                    l_m_r.append("left")
-                elif x > width * 0.75:
-                    l_m_r.append("right")
-                else:
-                    l_m_r.append("center")
-        # Process class labels
-        if hasattr(result, 'boxes') and hasattr(result.boxes, 'cls'):
-            for index in result.boxes.cls:
-                # Convert index to integer
-                idx = int(index.item() if hasattr(index, 'item') else index)
-                array_objects.append(str(result.names[idx]))
-    except Exception as e:
-        print(f"Error in return_attributes: {e}")
-        return {"labels": [], "left_right": []}
-    return {"labels": array_objects, "left_right": l_m_r}
-def sort_attributes(attributes):
-    """Sort attributes based on position order"""
-    # Define the desired order
-    position_order = {'center': 0, 'left': 1, 'right': 2}
-    # Combine the labels and positions into pairs
-    combined = list(zip(attributes['left_right'], attributes['labels']))
-    # Sort the combined list based on the defined position order
-    sorted_combined = sorted(combined, key=lambda x: position_order.get(x[0], float('inf')))
-    # Unzip the sorted pairs back into separate lists
-    sorted_positions, sorted_labels = zip(*sorted_combined) if combined else ([], [])
-    # Return the sorted attributes as a new dictionary
-    return {
-        'labels': list(sorted_labels),
-        'left_right': list(sorted_positions)
-    }
-def count_objects(labels, directions):
-    """Group identical labels only when in the same direction"""
-    # Count each (label, direction) pair
-    pair_counts = Counter(zip(labels, directions))
-    # Define direction ranking for sorting
-    rank = {'center': 0, 'left': 1, 'right': 2}
-    # Prepare a helper to pluralize labels when count > 1
-    def pluralize(word, count):
-        if count == 1:
-            return f"a {word}"
-        # simple English plural rules
-        if any(word.endswith(s) for s in ('s','x','z','sh','ch')):
-            return f"{count} {word}es"
-        if word.endswith('y') and word[-2] not in 'aeiou':
-            return f"{count} {word[:-1]}ies"
-        return f"{count} {word}s"
-    # Sort the unique (label, direction) keys by direction rank
-    sorted_pairs = sorted(pair_counts.items(),
-                        key=lambda x: rank.get(x[0][1], float('inf')))
-    # Build the output lists
-    out_labels = []
-    out_dirs = []
-    for (label, direction), cnt in sorted_pairs:
-        out_labels.append(pluralize(label, cnt))
-        out_dirs.append(direction)
-    return {'labels': out_labels, 'left_right': out_dirs}
-def join_items(items):
-    """Join a list of strings into a human-readable list"""
-    if len(items) == 1:
-        return items[0]
-    if len(items) == 2:
-        return f"{items[0]} and {items[1]}"
-    return ", ".join(items[:-1]) + " and " + items[-1]
-def make_sentence(attrib_dict):
-    """Build a sentence from attribute dictionary"""
-    labels = attrib_dict["labels"]
-    directions = attrib_dict["left_right"]
-    # Return early if no objects detected
-    if not labels:
-        return "No objects detected."
-    # Group labels by direction
-    grouped = {"center": [], "left": [], "right": []}
-    for lbl, dr in zip(labels, directions):
-        if dr in grouped:
-            grouped[dr].append(lbl)
-    parts = []
-    has_center_objects = bool(grouped["center"])
-    # Front clause (center)
-    front = grouped["center"]
-    if front:
-        verb = "is" if len(front) == 1 and not front[0].startswith(('2 ', '3 ', '4 ')) else "are"
-        parts.append(f"There {verb} {join_items(front)} in front of you")
-    # Side clauses (left, right)
-    side_parts = []
-    for side in ("left", "right"):
-        items = grouped[side]
-        if items:
-            if len(items) == 1:
-                item = items[0]
-                if item.startswith("a "):
-                    side_parts.append(f"There is {item} to your {side}")
-                else:
-                    verb = "is" if item.startswith(("1 ", "one ")) else "are"
-                    side_parts.append(f"There {verb} {item} to your {side}")
-            else:
-                side_parts.append(f"There are {join_items(items)} to your {side}")
-    # Determine whether to use "also"
-    if side_parts and has_center_objects:
-        # Modify first side part to include "also"
-        if side_parts:
-            first_side = side_parts[0]
-            words = first_side.split(" ", 2)
-            if len(words) >= 2:
-                side_parts[0] = f"{words[0]} also {words[1]} {words[2]}"
-    parts.extend(side_parts)
-    # Join all parts with periods
-    return ". ".join(parts) + "."
-def draw_detection(frame, result, display_labels=True, left_right_lines=True, scale=1.0):
-    """Draw bounding boxes and labels on the frame with proper scaling"""
-    try:
-        # Use numpy arrays directly when possible to avoid unnecessary data transfers
-        if hasattr(result.boxes, 'xyxy') and hasattr(result.boxes.xyxy, 'cpu'):
-            boxes = result.boxes.xyxy.cpu().numpy()
-        else:
-            boxes = result.boxes.xyxy.numpy() if hasattr(result.boxes.xyxy, 'numpy') else result.boxes.xyxy
-        if hasattr(result.boxes, 'cls') and hasattr(result.boxes.cls, 'cpu'):
-            class_ids = result.boxes.cls.cpu().numpy().astype(int)
-        else:
-            class_ids = result.boxes.cls.numpy().astype(int) if hasattr(result.boxes.cls, 'numpy') else result.boxes.cls.astype(int)
-        class_names = result.names
-        # If image was resized, adjust the boxes back to original size
-        # The boxes are in the coordinates of the resized image, so we need to scale them back
-        if scale != 1.0:
-            boxes = boxes / scale
-        for index in range(len(boxes)):
-            box = boxes[index]
-            class_id = class_ids[index]
-            label = str(class_names[class_id])
-            # Ensure box coordinates are integers
-            x1, y1, x2, y2 = map(int, box)
-            # Draw rectangle
-            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
-            if display_labels:
-                # Add a background for text to make it more readable
-                text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.75, 2)[0]
-                cv2.rectangle(frame, (x1, y1-text_size[1]-10), (x1+text_size[0], y1), (0, 0, 0), -1)
-                cv2.putText(frame, label, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 255, 0), 2)
-        if left_right_lines:
-            h, w = frame.shape[:2]
-            cv2.line(frame, (int(w*.25), 0), (int(w*0.25), h), (0, 255, 0), 2)
-            cv2.line(frame, (int(w*.75), 0), (int(w*0.75), h), (0, 255, 0), 2)
-    except Exception as e:
-        print(f"Error drawing detections: {e}")
-        import traceback
-        traceback.print_exc()
-def add_sentence_to_image(frame, sentence):
-    """Add descriptive sentence to bottom of image"""
-    if not sentence:
-        return
-    h, w = frame.shape[:2]
-    # Calculate font size based on image dimensions
-    font_scale = min(w, h) / 1000  # Scale font with image size
-    font_scale = max(0.6, min(5, font_scale))  # Keep between 0.6 and 1.2
-    # Create a semi-transparent background for better text visibility
-    font = cv2.FONT_HERSHEY_SIMPLEX
-    text_size = cv2.getTextSize(sentence, font, font_scale, 2)[0]
-    # Split text into multiple lines if too long
-    max_width = int(w * 0.9)  # Maximum width for text is 90% of image width
-    if text_size[0] > max_width:
-        words = sentence.split()
-        lines = []
-        current_line = []
-        for word in words:
-            test_line = ' '.join(current_line + [word])
-            test_size = cv2.getTextSize(test_line, font, font_scale, 2)[0]
-            if test_size[0] <= max_width:
-                current_line.append(word)
-            else:
-                lines.append(' '.join(current_line))
-                current_line = [word]
-        if current_line:
-            lines.append(' '.join(current_line))
-        # Draw background and text for each line
-        padding = 10
-        line_height = text_size[1] + padding
-        bg_height = (line_height * len(lines)) + padding * 2
-        # Create semi-transparent overlay for text background
-        overlay = frame.copy()
-        bg_y1 = h - bg_height
-        cv2.rectangle(overlay, (0, bg_y1), (w, h), (0, 0, 0), -1)
-        cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
-        # Draw each line of text
-        for i, line in enumerate(lines):
-            y_pos = bg_y1 + padding + (i + 1) * line_height - padding//2
-            cv2.putText(frame, line, (padding, y_pos), font, font_scale, (255, 255, 255), 2)
-    else:
-        # For short sentences, just add a single line of text
-        padding = 10
-        # Create semi-transparent overlay for text background
-        overlay = frame.copy()
-        bg_y1 = h - text_size[1] - padding * 3
-        cv2.rectangle(overlay, (0, bg_y1), (w, h), (0, 0, 0), -1)
-        cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
-        # Position text in bottom right with padding
-        cv2.putText(frame, sentence, (padding, h - padding), font, font_scale, (255, 255, 255), 2)
-"""process audio / video"""
-def process_image(image_path, confidence=0.5):
-    """
-    Process a single image file with YOLO detection
-    Adds a descriptive sentence at the bottom of the image
-    Returns the processed image
-    """
-    if not image_path:
-        return None
-    try:
-        start_time = time.time()
-        # Store original image for drawing results
-        original_img = cv2.imread(image_path)
-        if original_img is None:
-            return None
-        # Preprocess image - resize to max 640px on longest side for YOLO processing
-        processed_img, scale_factor = preprocess_image(image_path, max_size=640)
-        if processed_img is None:
-            return None
-        # Run YOLO inference with optimized settings on the RESIZED image
-        results = yoloV11_Small(
-            processed_img,
-            conf=confidence,
-            verbose=False,
-            iou=0.5,
-            agnostic_nms=True,
-            device=user_device
-        )
-        # Create a copy of the ORIGINAL image for drawing
-        output_frame = original_img.copy()
-        # Process the detections
-        current_sentence = "No objects detected."
-        for result in results:
-            # Get original frame shape for attribute calculation
-            frame_shape = processed_img.shape  # Use the processed image shape for attribute calculation
-            # Draw detections on the ORIGINAL frame with proper scaling
-            draw_detection(output_frame, result, display_labels=True, left_right_lines=True, scale=scale_factor)
-            # Process attributes based on the PROCESSED image (as that's what YOLO saw)
-            attributes = return_attributes(result, frame_shape)
-            if attributes["labels"]:
-                attrs = sort_attributes(attributes)
-                counted = count_objects(attrs["labels"], attrs["left_right"])
-                current_sentence = make_sentence(counted)
-        # Add the sentence to the image
-        add_sentence_to_image(output_frame, current_sentence)
-        # Convert from BGR to RGB for displaying in Gradio
-        output_frame_rgb = cv2.cvtColor(output_frame, cv2.COLOR_BGR2RGB)
-        end_time = time.time()
-        print(f"Processing time: {end_time - start_time:.3f} seconds")
-        return output_frame_rgb
-    except Exception as e:
-        print(f"Image processing error: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
-"""gradio app for image processing"""
-EXAMPLES_FOLDER = "examples"
-TEMP_FILES_FOLDER = Path("temp_files")
-# Ensure temp folder exists
-if not os.path.exists(TEMP_FILES_FOLDER):
-    os.makedirs(TEMP_FILES_FOLDER)
-# Function to load example images from a folder
-def load_examples_from_folder(folder_path=EXAMPLES_FOLDER):
-    """Load all images from a specified folder as examples"""
-    if not os.path.exists(folder_path):
-        os.makedirs(folder_path)
-        print(f"Created examples folder at {folder_path}")
-        return []
-    # Valid file extensions for images only
-    image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"]
-    # Get all image files in the folder efficiently
-    example_files = []
-    for ext in image_extensions:
-        example_files.extend(glob.glob(os.path.join(folder_path, f"*{ext}")))
-        example_files.extend(glob.glob(os.path.join(folder_path, f"*{ext.upper()}")))
-    print(f"Loaded {len(example_files)} example images from {folder_path}")
-    return example_files
-# Load example images once at startup
-raw = load_examples_from_folder()
-unique = list(dict.fromkeys(raw))              # remove duplicates
-example_images = [(p, None) for p in unique]   # tell Gallery about each image
-def select_from_gallery(evt: gr.SelectData):
-    """Handle selection from the gallery"""
-    selected_path = example_images[evt.index][0]
-    return selected_path
-custom_css = """
-/* Allow auto height for content areas */
-.fixed-height.svelte-842rpi.svelte-842rpi {
-  min-height: 0        !important;
-  max-height: none     !important;
-  height: auto         !important;
-}
-/* Maintain responsive behavior */
-@media (min-width: 1280px) {
-  .fixed-height.svelte-842rpi.svelte-842rpi {
-    min-height: 0      !important;
-    max-height: none   !important;
-    height: auto       !important;
-  }
-}
-"""
-# Create the Gradio interface
-with gr.Blocks(title="Object Detection and Description", css=custom_css) as demo:
-    gr.Markdown("## 🔍 Object Detection with Scene Description")
-    gr.Markdown("Upload or select an image to detect objects and get a descriptive sentence.")
-    # Store current image path
-    current_image = gr.State()
-    with gr.Row():
-        with gr.Column(scale=1):
-            # Image input component
-            image_input = gr.Image(label="Upload Image", type="filepath")
-            # Controls
-            upload_btn = gr.UploadButton(
-                "Upload Image",
-                file_types=["image"]
-            )
-            confidence = gr.Slider(
-                minimum=0.1,
-                maximum=0.9,
-                value=0.5,
-                step=0.025,
-                label="Detection Confidence"
-            )
-        with gr.Column(scale=1):
-            # Output component
-            image_output = gr.Image(label="Scene Analysis")
-    # Example gallery at bottom
-    with gr.Row(variant="panel"):
-        if example_images:
-            examples_gallery = gr.Gallery(
-                value=example_images,
-                label=f"Example Images (Click to Select) - {len(example_images)} examples from {EXAMPLES_FOLDER}",
-                columns=6,
-                elem_id="image_gallery",
-                allow_preview=False,
-                elem_classes=["centered-examples"]
-            )
-        else:
-            gr.Markdown(
-                f"No example images found in {EXAMPLES_FOLDER} folder. Add image files to see examples."
-            )
-    # Connect components
-    upload_btn.upload(
-        fn=lambda file_obj: file_obj.name if hasattr(file_obj, 'name') else str(file_obj),
-        inputs=[upload_btn],
-        outputs=[image_input]
-    )
-    image_input.change(
-        fn=lambda x: x,
-        inputs=[image_input],
-        outputs=[current_image]
-    )
-    if example_images:
-        examples_gallery.select(
-            fn=select_from_gallery,
-            outputs=[image_input]
-        )
-    # Process immediately when image is uploaded
-    image_input.change(
-        fn=process_image,
-        inputs=[image_input, confidence],
-        outputs=[image_output]
-    )
-if __name__ == "__main__":
-    # Create examples folder if it doesn't exist
-    if not os.path.exists(EXAMPLES_FOLDER):
-        os.makedirs(EXAMPLES_FOLDER)
-        print(f"Created examples folder at {EXAMPLES_FOLDER}")
     demo.launch(share=False)

+"""check torch"""
+import torch
+# Startup diagnostics: print the torch version, the CUDA version torch reports,
+# and whether cuDNN is enabled, so the runtime configuration shows up in the logs.
+print(torch.__version__)
+print(torch.version.cuda)
+print(torch.backends.cudnn.enabled)
+"""global variables"""
+# Device string used for the model below and passed as `device=` on inference:
+# "cuda" when torch reports an available GPU, otherwise "cpu".
+user_device = "cuda" if torch.cuda.is_available() else "cpu"
+"""Import libraries"""
+import cv2
+import os
+import glob
+from pathlib import Path
+from collections import Counter
+import gradio as gr
+from PIL import Image
+import numpy as np
+import time
+"""yolo model loading"""
+from ultralytics import YOLO
+# Load model once at startup and keep in memory
+# NOTE(review): the weights path is relative, so it presumably resolves against
+# the process working directory -- confirm for the deployment environment.
+yoloV11_Small = YOLO("yolo_models_downloaded/yolo11s.pt", task="detect").to(user_device).eval()
+# Image preprocessing function
+def preprocess_image(image_path, max_size=640):
+    """
+    Load an image and shrink it so that its longest side is at most ``max_size``.
+    Images that already fit are returned unchanged (they are never upscaled).
+    The aspect ratio is preserved.
+
+    Args:
+        image_path: Path of the image file, read with ``cv2.imread``.
+        max_size: Upper bound, in pixels, for the longest image side.
+
+    Returns:
+        ``(image, scale)`` where ``scale`` is the factor applied to the original
+        dimensions (1.0 when no resize happened), or ``(None, 1.0)`` when the
+        image could not be read or resized.
+    """
+    try:
+        img = cv2.imread(image_path)
+        if img is None:
+            raise IOError(f"Could not open image: {image_path}")
+        h, w = img.shape[:2]
+        # Only scale down, not up
+        scale = min(max_size / max(h, w), 1.0)
+        if scale < 1.0:
+            # Clamp to one pixel: for a very elongated image the short side
+            # would otherwise truncate to 0 and cv2.resize would reject it.
+            new_h = max(1, int(h * scale))
+            new_w = max(1, int(w * scale))
+            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
+        return img, scale
+    except Exception as e:
+        # Best-effort contract: report the problem and let the caller bail out.
+        print(f"Error preprocessing image: {e}")
+        return None, 1.0
+""" helper functions"""
+def return_attributes(result, frame_shape):
+    """
+    Extract class labels and coarse horizontal positions from one detection result.
+
+    Args:
+        result: Detection result exposing ``boxes.xywh`` (rows whose first value
+            is the box centre x), ``boxes.cls`` (class indices) and ``names``
+            (index -> label mapping). Missing attributes yield empty lists.
+        frame_shape: Shape of the frame the boxes refer to, ``(height, width, ...)``.
+            ``None`` or an unusable value falls back to a 640 px wide frame.
+
+    Returns:
+        A dictionary with two lists: ``"labels"`` (label strings) and
+        ``"left_right"`` (``"left"``, ``"center"`` or ``"right"`` per box).
+        Both lists are empty if anything goes wrong while reading ``result``.
+    """
+    # Only the width matters for left/center/right classification.
+    width = 640
+    if frame_shape is not None:
+        try:
+            _, width = frame_shape[:2]
+        except (AttributeError, TypeError, ValueError):
+            # Non-subscriptable or too-short shape: keep the default width.
+            width = 640
+    positions = []
+    labels = []
+    try:
+        boxes = getattr(result, 'boxes', None)
+        if hasattr(boxes, 'xywh'):
+            for box in boxes.xywh:
+                # Tensor elements expose .item(); plain numbers do not.
+                x = float(box[0].item() if hasattr(box[0], 'item') else box[0])
+                # Outer quarters of the frame count as left/right.
+                if x < width * 0.25:
+                    positions.append("left")
+                elif x > width * 0.75:
+                    positions.append("right")
+                else:
+                    positions.append("center")
+        if hasattr(boxes, 'cls'):
+            for index in boxes.cls:
+                idx = int(index.item() if hasattr(index, 'item') else index)
+                labels.append(str(result.names[idx]))
+    except Exception as e:
+        print(f"Error in return_attributes: {e}")
+        return {"labels": [], "left_right": []}
+    return {"labels": labels, "left_right": positions}
+def sort_attributes(attributes):
+    """
+    Reorder detections so that center items come first, then left, then right.
+
+    The sort is stable, so detections sharing a position keep their relative
+    order; positions outside the three known ones are placed last.
+    Returns a new dictionary with the reordered 'labels' and 'left_right' lists.
+    """
+    priority = {'center': 0, 'left': 1, 'right': 2}
+    # Pair every position with its label so both lists move together.
+    ordered_pairs = sorted(
+        zip(attributes['left_right'], attributes['labels']),
+        key=lambda pair: priority.get(pair[0], float('inf')),
+    )
+    return {
+        'labels': [label for _, label in ordered_pairs],
+        'left_right': [position for position, _ in ordered_pairs],
+    }
+def count_objects(labels, directions):
+    """
+    Group identical labels only when in the same direction.
+
+    Args:
+        labels: Label strings, one per detection.
+        directions: Position strings ("center", "left", "right"), parallel to ``labels``.
+
+    Returns:
+        A dictionary with 'labels' (phrases such as "a dog" or "3 dogs") and
+        'left_right' (the direction of each phrase), ordered center, left,
+        right; unknown directions come last.
+    """
+    pair_counts = Counter(zip(labels, directions))
+    rank = {'center': 0, 'left': 1, 'right': 2}
+    # Detector labels whose plural is not formed by the simple suffix rules below.
+    irregular_plurals = {
+        'person': 'people',
+        'knife': 'knives',
+        'mouse': 'mice',
+        'sheep': 'sheep',
+        'skis': 'skis',
+        'scissors': 'scissors',
+    }
+    def pluralize(word, count):
+        if count == 1:
+            return f"a {word}"
+        if word in irregular_plurals:
+            return f"{count} {irregular_plurals[word]}"
+        # simple English plural rules
+        if word.endswith(('s', 'x', 'z', 'sh', 'ch')):
+            return f"{count} {word}es"
+        # The length check keeps a one-letter label from indexing out of range.
+        if len(word) > 1 and word.endswith('y') and word[-2] not in 'aeiou':
+            return f"{count} {word[:-1]}ies"
+        return f"{count} {word}s"
+    # Counter keeps first-seen order and sorted() is stable, so pairs sharing a
+    # direction stay in detection order.
+    sorted_pairs = sorted(pair_counts.items(),
+                          key=lambda item: rank.get(item[0][1], float('inf')))
+    out_labels = [pluralize(label, cnt) for (label, _), cnt in sorted_pairs]
+    out_dirs = [direction for (_, direction), _ in sorted_pairs]
+    return {'labels': out_labels, 'left_right': out_dirs}
+def join_items(items):
+    """
+    Join a list of strings into a human-readable list.
+
+    One item is returned as is, two are joined with " and ", three or more use
+    commas with " and " before the last item. An empty list gives "".
+    """
+    if not items:
+        return ""
+    if len(items) <= 2:
+        return " and ".join(items)
+    return ", ".join(items[:-1]) + " and " + items[-1]
+def make_sentence(attrib_dict):
+    """
+    Build a spoken-style scene description from counted detections.
+
+    Args:
+        attrib_dict: Dictionary with parallel lists "labels" (phrases such as
+            "a dog" or "3 dogs") and "left_right" ("center", "left", "right").
+
+    Returns:
+        One clause per occupied region, center first, joined with ". " and
+        ending with a period. When center objects exist, the first side clause
+        reads "There also ...". Returns "No objects detected." when nothing
+        can be described.
+    """
+    labels = attrib_dict["labels"]
+    directions = attrib_dict["left_right"]
+    if not labels:
+        return "No objects detected."
+    grouped = {"center": [], "left": [], "right": []}
+    for lbl, dr in zip(labels, directions):
+        if dr in grouped:
+            grouped[dr].append(lbl)
+    def verb_for(items):
+        # Singular only for exactly one phrase that names exactly one object;
+        # any other count ("2 dogs", "5 chairs", ...) or several phrases take "are".
+        is_single = len(items) == 1 and items[0].startswith(("a ", "an ", "1 ", "one "))
+        return "is" if is_single else "are"
+    parts = []
+    front = grouped["center"]
+    if front:
+        parts.append(f"There {verb_for(front)} {join_items(front)} in front of you")
+    side_parts = [
+        f"There {verb_for(grouped[side])} {join_items(grouped[side])} to your {side}"
+        for side in ("left", "right")
+        if grouped[side]
+    ]
+    # Only the first side clause gets "also", and only after a front clause.
+    if front and side_parts:
+        side_parts[0] = "There also" + side_parts[0][len("There"):]
+    parts.extend(side_parts)
+    if not parts:
+        # Every detection had an unrecognised direction; avoid returning a bare ".".
+        return "No objects detected."
+    return ". ".join(parts) + "."
+def draw_detection(frame, result, display_labels=True, left_right_lines=True, scale=1.0):
+    """
+    Draw bounding boxes, class labels and the left/right guide lines on ``frame``.
+
+    Args:
+        frame: Image array that is drawn on in place.
+        result: Detection result exposing ``boxes.xyxy``, ``boxes.cls`` and ``names``.
+        display_labels: When True, write each class name on a filled strip.
+        left_right_lines: When True, draw vertical lines at 25% and 75% of the width.
+        scale: Factor that was applied to the image before detection; box
+            coordinates are divided by it to map them back onto ``frame``.
+
+    Errors are printed with a traceback and swallowed so that a drawing
+    failure does not abort the caller.
+    """
+    def to_numpy(values):
+        # Tensors are moved to the CPU first; array-likes are used as they are.
+        if hasattr(values, 'cpu'):
+            return values.cpu().numpy()
+        return values.numpy() if hasattr(values, 'numpy') else values
+    try:
+        boxes = to_numpy(result.boxes.xyxy)
+        class_ids = to_numpy(result.boxes.cls).astype(int)
+        class_names = result.names
+        # The boxes are in the coordinates of the resized image, so scale them back
+        if scale != 1.0:
+            boxes = boxes / scale
+        font = cv2.FONT_HERSHEY_SIMPLEX
+        for box, class_id in zip(boxes, class_ids):
+            label = str(class_names[class_id])
+            x1, y1, x2, y2 = map(int, box)
+            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
+            if display_labels:
+                text_w, text_h = cv2.getTextSize(label, font, 0.75, 2)[0]
+                strip_h = text_h + 10
+                # The strip normally sits on top of the box; when the box touches
+                # the top edge there is no room, so move it just inside the box.
+                strip_bottom = y1 if y1 - strip_h >= 0 else y1 + strip_h
+                cv2.rectangle(frame, (x1, strip_bottom - strip_h), (x1 + text_w, strip_bottom), (0, 0, 0), -1)
+                cv2.putText(frame, label, (x1, strip_bottom - 10), font, 0.75, (0, 255, 0), 2)
+        if left_right_lines:
+            h, w = frame.shape[:2]
+            cv2.line(frame, (int(w * 0.25), 0), (int(w * 0.25), h), (0, 255, 0), 2)
+            cv2.line(frame, (int(w * 0.75), 0), (int(w * 0.75), h), (0, 255, 0), 2)
+    except Exception as e:
+        print(f"Error drawing detections: {e}")
+        import traceback
+        traceback.print_exc()
+def add_sentence_to_image(frame, sentence):
+    """
+    Write ``sentence`` on a darkened band at the bottom of ``frame`` (in place).
+
+    The font scale follows the smaller image dimension, and text wider than
+    90% of the image is wrapped onto several lines. Nothing is drawn for an
+    empty sentence.
+
+    Args:
+        frame: Image array that is modified in place.
+        sentence: Text to render.
+    """
+    if not sentence:
+        return
+    h, w = frame.shape[:2]
+    # Scale font with image size, clamped to the range 0.6 .. 5
+    font_scale = max(0.6, min(5, min(w, h) / 1000))
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    text_size = cv2.getTextSize(sentence, font, font_scale, 2)[0]
+    max_width = int(w * 0.9)  # Maximum width for text is 90% of image width
+    padding = 10
+    if text_size[0] > max_width:
+        # Greedy word wrap: keep adding words while the line still fits.
+        lines = []
+        current_line = []
+        for word in sentence.split():
+            candidate = ' '.join(current_line + [word])
+            if cv2.getTextSize(candidate, font, font_scale, 2)[0][0] <= max_width:
+                current_line.append(word)
+            else:
+                # A first word that is too wide on its own must not emit an
+                # empty line ahead of it.
+                if current_line:
+                    lines.append(' '.join(current_line))
+                current_line = [word]
+        if current_line:
+            lines.append(' '.join(current_line))
+        line_height = text_size[1] + padding
+        bg_height = (line_height * len(lines)) + padding * 2
+        # Blend a black band over the bottom of the frame as text background
+        overlay = frame.copy()
+        bg_y1 = h - bg_height
+        cv2.rectangle(overlay, (0, bg_y1), (w, h), (0, 0, 0), -1)
+        cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
+        for i, line in enumerate(lines):
+            y_pos = bg_y1 + padding + (i + 1) * line_height - padding // 2
+            cv2.putText(frame, line, (padding, y_pos), font, font_scale, (255, 255, 255), 2)
+    else:
+        # Short sentence: a single line on a band just tall enough for it
+        overlay = frame.copy()
+        bg_y1 = h - text_size[1] - padding * 3
+        cv2.rectangle(overlay, (0, bg_y1), (w, h), (0, 0, 0), -1)
+        cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
+        # Position text in bottom left with padding
+        cv2.putText(frame, sentence, (padding, h - padding), font, font_scale, (255, 255, 255), 2)
+"""process audio / video"""
+def process_image(image_path, confidence=0.5):
+    """
+    Run YOLO detection on one image file and annotate the result.
+
+    Detection runs on a downscaled copy (longest side at most 640 px); boxes,
+    guide lines and the descriptive sentence are drawn on a copy of the
+    original image.
+
+    Returns the annotated image in RGB channel order, or None when the path is
+    empty, the image cannot be read, or processing fails.
+    """
+    if not image_path:
+        return None
+    try:
+        started = time.time()
+        # Full-resolution image: this is what gets drawn on
+        original_img = cv2.imread(image_path)
+        if original_img is None:
+            return None
+        # Downscaled image: this is what the detector sees
+        processed_img, scale_factor = preprocess_image(image_path, max_size=640)
+        if processed_img is None:
+            return None
+        inference_options = dict(
+            conf=confidence,
+            verbose=False,
+            iou=0.5,
+            agnostic_nms=True,
+            device=user_device,
+        )
+        results = yoloV11_Small(processed_img, **inference_options)
+        output_frame = original_img.copy()
+        current_sentence = "No objects detected."
+        # Positions are classified in the coordinates of the processed image
+        frame_shape = processed_img.shape
+        for result in results:
+            # Boxes are mapped back to the original resolution via scale_factor
+            draw_detection(output_frame, result, display_labels=True, left_right_lines=True, scale=scale_factor)
+            attributes = return_attributes(result, frame_shape)
+            if not attributes["labels"]:
+                continue
+            attrs = sort_attributes(attributes)
+            counted = count_objects(attrs["labels"], attrs["left_right"])
+            current_sentence = make_sentence(counted)
+        add_sentence_to_image(output_frame, current_sentence)
+        # OpenCV works in BGR; Gradio displays RGB
+        output_frame_rgb = cv2.cvtColor(output_frame, cv2.COLOR_BGR2RGB)
+        elapsed = time.time() - started
+        print(f"Processing time: {elapsed:.3f} seconds")
+        return output_frame_rgb
+    except Exception as e:
+        print(f"Image processing error: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+"""gradio app for image processing"""
+EXAMPLES_FOLDER = "examples"
+TEMP_FILES_FOLDER = Path("temp_files")
+# Ensure temp folder exists
+if not os.path.exists(TEMP_FILES_FOLDER):
+    os.makedirs(TEMP_FILES_FOLDER)
+# Function to load example images from a folder
+def load_examples_from_folder(folder_path=EXAMPLES_FOLDER):
+    """Load all images from a specified folder as examples"""
+    if not os.path.exists(folder_path):
+        os.makedirs(folder_path)
+        print(f"Created examples folder at {folder_path}")
+        return []
+    # Valid file extensions for images only
+    image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"]
+    # Get all image files in the folder efficiently
+    example_files = []
+    for ext in image_extensions:
+        example_files.extend(glob.glob(os.path.join(folder_path, f"*{ext}")))
+        example_files.extend(glob.glob(os.path.join(folder_path, f"*{ext.upper()}")))
+    print(f"Loaded {len(example_files)} example images from {folder_path}")
+    return example_files
+# Load example images once at startup
+raw = load_examples_from_folder()
+# dict.fromkeys keeps the first occurrence of every path and preserves the order
+unique = list(dict.fromkeys(raw))              # remove duplicates
+# Pair each path with None as the second element
+# (presumably the (image, caption) form gr.Gallery accepts - confirm against the Gradio docs)
+example_images = [(p, None) for p in unique]   # tell Gallery about each image
+def select_from_gallery(evt: gr.SelectData):
+    """Handle selection from the gallery"""
+    selected_path = example_images[evt.index][0]
+    return selected_path
+# Stylesheet handed to gr.Blocks(css=...) below. It resets min-height, max-height
+# and height on elements carrying the "fixed-height" class, both in general and
+# inside the min-width: 1280px media query.
+# NOTE(review): "svelte-842rpi" looks like a build-generated class name and may
+# differ between Gradio versions - confirm the selector still matches after upgrades.
+custom_css = """
+/* Allow auto height for content areas */
+.fixed-height.svelte-842rpi.svelte-842rpi {
+  min-height: 0        !important;
+  max-height: none     !important;
+  height: auto         !important;
+}
+/* Maintain responsive behavior */
+@media (min-width: 1280px) {
+  .fixed-height.svelte-842rpi.svelte-842rpi {
+    min-height: 0      !important;
+    max-height: none   !important;
+    height: auto       !important;
+  }
+}
+"""
+# Create the Gradio interface
+with gr.Blocks(title="VoiceView: Object Detection and Description", css=custom_css) as demo:
+    gr.Markdown("## 🔍 VoiceView: Object Detection with Scene Description")
+    gr.Markdown("Upload or select an image to detect objects and get a descriptive sentence.")
+    # Store current image path
+    current_image = gr.State()
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Image input component
+            image_input = gr.Image(label="Upload Image", type="filepath")
+            # Controls
+            upload_btn = gr.UploadButton(
+                "Upload Image",
+                file_types=["image"]
+            )
+            confidence = gr.Slider(
+                minimum=0.1,
+                maximum=0.9,
+                value=0.5,
+                step=0.025,
+                label="Detection Confidence"
+            )
+        with gr.Column(scale=1):
+            # Output component
+            image_output = gr.Image(label="Scene Analysis")
+    # Example gallery at bottom
+    with gr.Row(variant="panel"):
+        if example_images:
+            examples_gallery = gr.Gallery(
+                value=example_images,
+                label=f"Example Images (Click to Select) - {len(example_images)} examples from {EXAMPLES_FOLDER}",
+                columns=6,
+                elem_id="image_gallery",
+                allow_preview=False,
+                elem_classes=["centered-examples"]
+            )
+        else:
+            gr.Markdown(
+                f"No example images found in {EXAMPLES_FOLDER} folder. Add image files to see examples."
+            )
+    with gr.Column(variant="panel", scale=1):
+        gr.Markdown("## Video Version")
+        gr.Markdown("#### Unfortunately, this only runs locally due to latency when uploading images to Spaces.")
+        gr.Markdown("#### Test it yourself: download the `examples_video` folder and `app_local.py`.")
+        gr.Markdown("#### Don’t forget to install the required dependencies! :D")
+    with gr.Row(variant="panel"):
+        with gr.Column(scale=1):
+            gr.Markdown("### Input-Videos")
+            gr.Video("examples_video/test_video7_resolve.mp4", interactive=True, visible=True)
+            gr.Video("examples_video/test_video2_resolve.mp4", interactive=True, visible=True)
+        with gr.Column(scale=1):
+            gr.Markdown("### Model-Output")
+            gr.Video("converted_video/test_video7_resolve_converted.mp4", interactive=True, visible=True)
+            gr.Video("converted_video/test_video2_resolve_converted.mp4", interactive=True, visible=True)
+    # Connect components
+    upload_btn.upload(
+        fn=lambda file_obj: file_obj.name if hasattr(file_obj, 'name') else str(file_obj),
+        inputs=[upload_btn],
+        outputs=[image_input]
+    )
+    image_input.change(
+        fn=lambda x: x,
+        inputs=[image_input],
+        outputs=[current_image]
+    )
+    if example_images:
+        examples_gallery.select(
+            fn=select_from_gallery,
+            outputs=[image_input]
+        )
+    # Process immediately when image is uploaded
+    image_input.change(
+        fn=process_image,
+        inputs=[image_input, confidence],
+        outputs=[image_output]
+    )
+# Script entry point: runs only when the file is executed directly
+if __name__ == "__main__":
+    # Create examples folder if it doesn't exist
+    # (load_examples_from_folder() already does this when the module is loaded,
+    # so this is a second safety net)
+    if not os.path.exists(EXAMPLES_FOLDER):
+        os.makedirs(EXAMPLES_FOLDER)
+        print(f"Created examples folder at {EXAMPLES_FOLDER}")
     demo.launch(share=False)