# NOTE(review): the original paste began with stray artifact lines
# ("Spaces:", "Runtime error" x2) that are not Python; they are kept
# here as a comment so the file parses.
# importing modules
import cv2
import torch
from torchvision import transforms
from torchvision.models import detection

import numpy as np

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# class labels of the COCO detection dataset; 'N/A' entries are label ids
# that the dataset skips, kept so list index == model label id
classes = ['__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
           'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
           'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
           'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
           'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
           'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
           'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
           'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
           'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
           'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
           'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
           'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']

# one random BGR color per class, fixed for the lifetime of the process
colors = np.random.uniform(0, 255, size=(len(classes), 3))

# Faster R-CNN with a ResNet-50 FPN v2 backbone, pretrained on COCO.
# `weights="DEFAULT"` replaces the deprecated `pretrained=True` /
# `pretrained_backbone=True` flags and loads the same COCO checkpoint.
model = detection.fasterrcnn_resnet50_fpn_v2(weights="DEFAULT", progress=True).to(device)
model.eval()  # inference mode: disables dropout / batch-norm updates
print(model)  # print out the architecture of the model
# function to carry out object detection on images. | |
def img_detect(img_path): | |
image = cv2.imread(img_path) # reads the model using OpenCV | |
image = cv2.resize(image, (640, 480)) | |
orig = image.copy() | |
# changing the colorspace from BGR to RGB (since Pytorch trains only RGB image) | |
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) | |
image = image.transpose((2, 0, 1)) # swapping the color channels from channels last to channels first | |
image = np.expand_dims(image, axis=0) # add batch dimension to the image | |
image = image / 255.0 # scaling image from (0,255) to (0,1) | |
image = torch.FloatTensor(image) # changes the numpy array to a tensor. | |
image = image.to(device) | |
detections = model(image)[0] # the image is passed to the model to get the bounding boxes | |
people = 0 | |
# loop to construct bounding boxes on image. | |
for i in range(0, len(detections["boxes"])): | |
confidence = detections["scores"][i] # get confidence score of each object in the image | |
idx = int(detections["labels"][i]) # identifying the id of each of the classes in the image | |
box = detections["boxes"][i].detach().cpu().numpy() # gets the coordinates for the bounding boxes | |
(X_1, Y_1, X_2, Y_2) = box.astype("int") | |
if confidence > 0.75 and idx == 1: | |
# matching the label index with its classes and its probability | |
label = f"{classes[idx]}, {idx}: {confidence* 100}%" | |
print(f"[INFO] {label}") | |
people += 1 | |
cv2.rectangle(orig, (X_1, Y_1), (X_2, Y_2), colors[idx], 2) # draw bounding boxes over each object | |
y = Y_1 - 15 if Y_1 - 15 > 15 else Y_1 + 15 | |
# adds the label text to the image. | |
cv2.putText(orig, label, (X_1, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[idx], 2) | |
cv2.putText(orig, f"Number of People: {people}", (5, 19), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[idx], 2) | |
return orig | |
# function to perform object detection in videos | |
def video_detection(video_path): | |
video = cv2.VideoCapture(video_path) | |
# frame_width = video.get(3) | |
# frame_height = video.get(4) | |
# out = cv2.VideoWriter(vid_out, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10, (frame_width, frame_height)) | |
while video.isOpened(): | |
ret, frame = video.read() | |
vid = frame.copy() | |
if not ret: | |
break | |
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) | |
frame = transforms.functional.to_tensor(frame) | |
frame = frame.to(device) | |
vid_detect = model([frame])[0] | |
for i in range(0, len(vid_detect["boxes"])): | |
confidence = vid_detect["scores"][i] | |
if confidence > 0.75: | |
idx = int(vid_detect["labels"][i]) | |
box = vid_detect["boxes"][i].detach().cpu().numpy() | |
(X_1, Y_1, X_2, Y_2) = box.astype("int") | |
label = f"{classes[idx]}, {idx}: {confidence* 100}%" | |
print(f"[INFO] {label}") | |
cv2.rectangle(vid, (X_1, Y_1), (X_2, Y_2), colors[idx], 2) | |
y = Y_1 - 15 if Y_1 - 15 > 15 else Y_1 + 15 | |
cv2.putText(vid, label, (X_1, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[idx], 2) | |
return vid | |