import gradio as gr
from pydantic import BaseModel
from typing import List
from PIL import Image, ImageDraw, ImageFont
import random
import torch
from transformers import Owlv2Processor, Owlv2ForObjectDetection
import logging
from logging.handlers import RotatingFileHandler
import base64
import io
import os
import numpy as np

class DetectionRequest(BaseModel):
    image_data: str
    texts: List[List[str]]

class DetectionResult(BaseModel):
    detections: List[str]
    image_with_boxes: str
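
# Illustrative only: these Pydantic schemas mirror the inputs/outputs of
# detect_objects_logic below, e.g. for a hypothetical REST wrapper:
#   req = DetectionRequest(image_data="<base64-encoded JPEG>", texts=[["cat", "dog"]])
#   annotated, detections = detect_objects_logic(req.image_data, req.texts)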

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
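
# Explicitly mark the model as inference-only. from_pretrained already loads
# weights in eval mode, so this is a harmless safeguard rather than a required step.
model.eval()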
# Create the logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)
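
# Route log output to a rotating file in the directory created above; the file
# name, size limit, and backup count below are assumptions, not fixed
# requirements of the app.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[RotatingFileHandler("logs/app.log", maxBytes=1_000_000, backupCount=3)],
)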

def draw_bounding_boxes(image: Image.Image, boxes, scores, labels, text_labels):
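    """Draw each detection box and a "<label>: <score>" caption onto the image.

    `labels` holds indices into `text_labels`, the candidate label strings.
    The image is modified in place and returned for convenience.
    """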
    draw = ImageDraw.Draw(image)

    # Define the color bank
    color_bank = ["#0AC2FF", "#47FF0A", "#FF0AC2", "#ADD8E6", "#FF0A47"]

    # Use default font
    font = ImageFont.load_default()

    for box, score, label in zip(boxes, scores, labels):
        # Choose a random color
        color = random.choice(color_bank)

        # Convert the box to a Python list if it's not already
        if isinstance(box, torch.Tensor):
            box = box.tolist()
        elif not isinstance(box, (list, tuple)):
            raise TypeError("Box must be a list or tuple of coordinates.")

        # Draw the rectangle
        draw.rectangle(box, outline=color, width=2)

        # Get the text to display
        display_text = f"{text_labels[label]}: {score:.2f}"

        # Position the label just above the box, clamped so it stays inside
        # the image when the box touches the top edge
        text_position = (box[0], max(0, box[1] - 10))

        # Draw the text
        draw.text(text_position, display_text, fill=color, font=font)

    return image

def detect_objects_logic(image_data, texts):
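    """Decode a base64 image, run OWLv2 open-vocabulary detection, and return
    (annotated PIL image, list of human-readable detection strings).

    `texts` is one list of candidate label strings per image (a single image here).
    """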
    try:
        # Decode the base64 image
        image_data_bytes = base64.b64decode(image_data)
        image = Image.open(io.BytesIO(image_data_bytes))
        width, height = image.size

        inputs = processor(text=texts, images=image, return_tensors="pt")
        # Inference only: skip gradient tracking to save memory and time
        with torch.no_grad():
            outputs = model(**inputs)

        # Post-processing expects target sizes as (height, width); PIL's
        # image.size is (width, height), hence the reversal
        target_sizes = torch.Tensor([image.size[::-1]])
        results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)

        detection_strings = []
        image_with_boxes = image.copy()  # Copy the image only once
        
        for i, text_group in enumerate(texts):
            if i >= len(results):
                logging.error(f"Text group index {i} exceeds results length.")
                continue
            logging.info(f"Processing texts: {texts}")
            results_per_group = results[i]
            boxes = results_per_group["boxes"]
            scores = results_per_group["scores"]
            labels = results_per_group["labels"]

            image_with_boxes = draw_bounding_boxes(image_with_boxes, boxes, scores, labels, text_group)

            for box, score, label in zip(boxes, scores, labels):
                # Boxes from post_process_object_detection are already in pixel
                # coordinates (target_sizes was supplied), so no rescaling is needed
                pixel_box = [round(coord.item(), 2) for coord in box]
                detection_string = f"Detected {text_group[label]} with confidence {round(score.item(), 3)} at location {pixel_box}"
                detection_strings.append(detection_string)

        logging.info("Bounding boxes and labels have been drawn on the image.")

        return image_with_boxes, detection_strings

    except IndexError as e:
        logging.error(f"Index error: {e}. Check if the number of text groups matches the model's output.")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}", exc_info=True)
        raise
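
# Direct usage (illustrative; the Gradio UI below is the normal entry point):
#   with open("assets/snowman.jpg", "rb") as f:
#       b64 = base64.b64encode(f.read()).decode("utf-8")
#   boxed_image, detection_strings = detect_objects_logic(b64, [["snowman"]])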

def gradio_detect_and_draw(image, text_labels):
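    """Gradio callback: bridge the UI inputs to detect_objects_logic.

    `image` arrives as a PIL image (or numpy array) from the Image component;
    `text_labels` is a comma-separated string of candidate labels.
    """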
    # Check if the image is None
    if image is None:
        raise ValueError("No image was provided.")

    # Convert the input image to PIL Image if it's a numpy array
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image.astype('uint8'), 'RGB')

    # Convert the PIL image to base64 for the detection logic; force RGB first,
    # since JPEG cannot encode an alpha channel
    buffered = io.BytesIO()
    image.convert("RGB").save(buffered, format="JPEG")
    image_data = base64.b64encode(buffered.getvalue()).decode("utf-8")

    # Split the comma-separated labels and strip surrounding whitespace
    if not text_labels:
        raise ValueError("No detection labels were provided.")
    text_labels = [[label.strip() for label in text_labels.split(',')]]

    # Run the detection logic
    processed_image, detections = detect_objects_logic(image_data, text_labels)

    # Convert the output image back to PIL if the logic returned a numpy array
    if isinstance(processed_image, np.ndarray):
        processed_image = Image.fromarray(processed_image.astype('uint8'), 'RGB')

    # Join the per-detection strings so the Text output renders one per line
    return processed_image, "\n".join(detections)


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## Owlv2 Object Detection Demo")
    gr.Markdown("Run this space on your own hardware with this command: "
                "```docker run -it -p 7860:7860 --platform=linux/amd64 "
                "registry.hf.space/macadeliccc-owlv2-base-patch-16-ensemble-demo:latest python app.py```")
    
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload or draw an image")
            text_input = gr.Textbox(label="Enter comma-separated labels for detection")
            submit_button = gr.Button("Detect")
        with gr.Column():
            image_output = gr.Image(label="Processed Image")
            text_output = gr.Text(label="Detections")
    

    submit_button.click(
        gradio_detect_and_draw, 
        inputs=[image_input, text_input], 
        outputs=[image_output, text_output]
    )
    # Add clickable example inputs
    examples = [
        ["assets/snowman.jpg", "snowman"],
        ["assets/traffic.jpg", "taxi,traffic light"],
        ["assets/umbrellas.jpg", "umbrella"]
    ]
    gr.Examples(examples, inputs=[image_input, text_input])


demo.launch()