# ObjectDetection / app.py
import cv2
import torch
from PIL import Image, ImageDraw
import gradio as gr
import numpy as np
import pandas as pd
from transformers import pipeline
# Load the YOLOv5 model from the ultralytics/yolov5 GitHub repo via torch.hub
yolo_repo = 'ultralytics/yolov5'
model = torch.hub.load(yolo_repo, 'yolov5s', source='github')
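# torch.hub caches the repository under ~/.cache/torch/hub on first use; the
# returned model wraps its own preprocessing, so it accepts PIL images, numpy
# arrays, or file paths directly.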
# Load the translation model
translator = pipeline("translation_en_to_ar", model="Helsinki-NLP/opus-mt-en-ar")
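# The pipeline returns one dict per input string, e.g. (illustrative output):
#   translator(["car"])  ->  [{'translation_text': 'سيارة'}]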
# Define a function to detect objects and draw bounding boxes for images
def detect_and_draw_image(input_image):
    results = model(input_image)
    detections = results.xyxy[0].cpu().numpy()

    draw = ImageDraw.Draw(input_image)
    counts = {}
    for detection in detections:
        xmin, ymin, xmax, ymax, conf, class_id = detection

        # Update the per-label count
        label = model.names[int(class_id)]
        counts[label] = counts.get(label, 0) + 1

        # Draw the bounding box and confidence score
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=2)
        draw.text((xmin, ymin), f"{label}: {conf:.2f}", fill="white")

    # Translate the detected labels to Arabic (guard against an empty list)
    labels = list(counts.keys())
    translated = translator(labels) if labels else []
    df = pd.DataFrame({
        'label (English)': labels,
        'label (Arabic)': [t['translation_text'] for t in translated],
        'counts': list(counts.values())
    })

    return input_image, df
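
# A quick standalone check of the image path (hypothetical file name):
#   annotated, table = detect_and_draw_image(Image.open("test.jpg"))
#   annotated.save("annotated.jpg"); print(table)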
# Define a function to detect objects and draw bounding boxes for videos
def detect_and_draw_video(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 20.0  # fall back if FPS metadata is missing
    frames = []
    overall_counts = {}
    detected_objects = set()  # Track unique detections across frames

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.resize(frame, (640, 480))
        # YOLOv5 expects RGB input; OpenCV decodes frames as BGR
        results = model(frame[..., ::-1])
        detections = results.xyxy[0].cpu().numpy()

        for detection in detections:
            xmin, ymin, xmax, ymax, conf, class_id = detection
            label = model.names[int(class_id)]

            # Identify the object by its label and rounded box centre; this is
            # a crude cross-frame de-duplication, not a real tracker
            identifier = (label, int((xmin + xmax) / 2), int((ymin + ymax) / 2))

            # Count the object only if it hasn't been seen at this position before
            if identifier not in detected_objects:
                detected_objects.add(identifier)
                overall_counts[label] = overall_counts.get(label, 0) + 1

            cv2.rectangle(frame, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (255, 0, 0), 2)
            cv2.putText(frame, f"{label}: {conf:.2f}", (int(xmin), int(ymin) - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2)

        frames.append(frame)

    cap.release()

    if not frames:  # no readable frames: nothing to write or count
        return None, None

    output_path = 'output.mp4'
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (640, 480))
    for frame in frames:
        out.write(frame)
    out.release()

    # Translate the detected labels to Arabic (guard against an empty list)
    labels = list(overall_counts.keys())
    translated = translator(labels) if labels else []
    df = pd.DataFrame({
        'label (English)': labels,
        'label (Arabic)': [t['translation_text'] for t in translated],
        'counts': list(overall_counts.values())
    })

    return output_path, df
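
# A quick standalone check of the video path (hypothetical file name):
#   out_path, table = detect_and_draw_video("sample.mp4")
#   print(out_path, table)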
# Create separate interfaces for images and videos
image_interface = gr.Interface(
    fn=detect_and_draw_image,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[gr.Image(type="pil"), gr.Dataframe(label="Object Counts")],
    title="Object Detection for Images",
    description="Upload an image to see the objects detected by YOLOv5 with bounding boxes and their counts."
)
video_interface = gr.Interface(
    fn=detect_and_draw_video,
    inputs=gr.Video(label="Upload Video"),
    outputs=[gr.Video(label="Processed Video"), gr.Dataframe(label="Object Counts")],
    title="Object Detection for Videos",
    description="Upload a video to see the objects detected by YOLOv5 with bounding boxes and their counts."
)
# Combine interfaces into a single app
app = gr.TabbedInterface([image_interface, video_interface], ["Image Detection", "Video Detection"])
# Launch the app
app.launch(debug=True)