# importing modules
import cv2
import torch
from torchvision import transforms
from torchvision.models import detection
import numpy as np
# Run inference on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# COCO class names, indexed by the integer label the detector returns (the
# functions in this file look names up as classes[idx]). The 'N/A' entries are
# placeholders that keep the list positions aligned with those label ids.
classes = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

# One random colour per class, one row of three values in [0, 255) each.
# The generator is not seeded, so the colours differ between runs.
colors = np.random.uniform(0, 255, size=(len(classes), 3))

# Faster R-CNN (ResNet-50 FPN v2) with COCO-trained weights.
# `weights="COCO_V1"` replaces the deprecated `pretrained=True` /
# `pretrained_backbone=True` flags and selects the same checkpoint; the
# backbone weights are part of that checkpoint, so no separate flag is needed.
model = detection.fasterrcnn_resnet50_fpn_v2(weights="COCO_V1", progress=True).to(device)
model.eval()  # inference mode: the model returns detections instead of losses
print(model)  # prints out the architecture of the model
# function to carry out person detection on images.
def img_detect(img_path):
    """Detect people in an image file and draw the results on a copy of it.

    The image is read with OpenCV, resized to 640x480 and passed through the
    module-level ``model``. Every detection whose label is 1 (``'person'`` in
    ``classes``) and whose score is above 0.75 gets a bounding box and a text
    label; the number of such detections is written in the top-left corner.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The resized BGR image (as returned by OpenCV) with the boxes, labels
        and people count drawn on it.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``img_path``.
    """
    person_id = 1
    min_score = 0.75

    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read an image from {img_path!r}")
    image = cv2.resize(image, (640, 480))
    orig = image.copy()  # drawing happens on this untouched BGR copy

    # Pre-processing: BGR -> RGB, channels-last -> channels-first, add a
    # batch dimension and scale the pixel values from [0, 255] to [0, 1].
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    batch = np.expand_dims(rgb.transpose((2, 0, 1)), axis=0) / 255.0
    # NOTE(review): the original line here was truncated ("image ="); moving
    # the tensor to `device`, where the model lives, is the assumed intent.
    batch = torch.as_tensor(batch, dtype=torch.float32, device=device)

    # No gradients are needed for inference.
    with torch.no_grad():
        detections = model(batch)[0]

    # Convert once to plain Python numbers: OpenCV drawing calls and string
    # formatting then never see tensors or numpy scalars.
    boxes = detections["boxes"].detach().cpu().numpy().astype("int").tolist()
    scores = detections["scores"].detach().cpu().tolist()
    labels = detections["labels"].detach().cpu().tolist()

    person_color = tuple(colors[person_id].tolist())
    people = 0
    for (x1, y1, x2, y2), confidence, idx in zip(boxes, scores, labels):
        if confidence <= min_score or idx != person_id:
            continue
        label = f"{classes[idx]}, {idx}: {confidence * 100:.2f}%"
        print(f"[INFO] {label}")
        people += 1
        cv2.rectangle(orig, (x1, y1), (x2, y2), person_color, 2)
        # Put the text above the box unless that would run off the top edge.
        y = y1 - 15 if y1 - 15 > 15 else y1 + 15
        cv2.putText(orig, label, (x1, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, person_color, 2)

    # Uses the person colour rather than the colour of whichever detection was
    # iterated last, which was undefined when the model returned nothing.
    cv2.putText(orig, f"Number of People: {people}", (5, 19),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, person_color, 2)
    return orig
# function to perform object detection in videos
def video_detection(video_path):
    """Run object detection on every frame of a video.

    Each frame is passed through the module-level ``model``; every detection
    with a score above 0.75 (any class) is boxed and labelled on a copy of the
    frame. Nothing is displayed or written to disk.

    NOTE(review): the original indentation was lost, so the position of the
    final ``return`` is ambiguous. This version assumes it sits after the
    frame loop, i.e. the last annotated frame is returned. Confirm against
    the callers.

    Args:
        video_path: Path (or other source accepted by ``cv2.VideoCapture``)
            of the video to process.

    Returns:
        The last frame read, as a BGR image with detections drawn on it, or
        ``None`` if the video opened but no frame could be read.

    Raises:
        FileNotFoundError: If the video source cannot be opened.
    """
    min_score = 0.75

    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        video.release()
        raise FileNotFoundError(f"Could not open video {video_path!r}")

    vid = None
    try:
        while video.isOpened():
            ret, frame = video.read()
            if not ret:
                # End of stream or read failure: `frame` is not usable here.
                break
            vid = frame.copy()  # drawing happens on this BGR copy

            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # NOTE(review): the original line after to_tensor was truncated
            # ("frame ="); moving the tensor to `device` is the assumed intent.
            tensor = transforms.functional.to_tensor(rgb).to(device)

            # No gradients are needed for inference.
            with torch.no_grad():
                vid_detect = model([tensor])[0]

            # Plain Python numbers for OpenCV and for string formatting.
            boxes = vid_detect["boxes"].detach().cpu().numpy().astype("int").tolist()
            scores = vid_detect["scores"].detach().cpu().tolist()
            labels = vid_detect["labels"].detach().cpu().tolist()

            for (x1, y1, x2, y2), confidence, idx in zip(boxes, scores, labels):
                if confidence <= min_score:
                    continue
                color = tuple(colors[idx].tolist())
                label = f"{classes[idx]}, {idx}: {confidence * 100:.2f}%"
                print(f"[INFO] {label}")
                cv2.rectangle(vid, (x1, y1), (x2, y2), color, 2)
                # Put the text above the box unless that would run off the top edge.
                y = y1 - 15 if y1 - 15 > 15 else y1 + 15
                cv2.putText(vid, label, (x1, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    finally:
        # Always free the capture handle, even if inference raises.
        video.release()
    return vid