In [None]:
import os
import cv2
import torch
from torchvision import transforms
from torchvision.models import detection
from PIL import Image
import pickle
import numpy as np
import sys


In [None]:
# checks if their is a gpu present, if not uses a cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# mainly consists of the classes present in the coco dataset
classes = ['__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
           'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
           'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
           'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
           'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
           'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
           'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
           'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
           'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
           'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
           'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
           'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']

colors = np.random.uniform(0,255, size=(len(classes), 3)) # assigning a color to each classes of the data

In [4]:
# calling the Faster-RCNN ResNet50 model
model = detection.fasterrcnn_resnet50_fpn(pretrained=True, progress=True, pretrained_backbone=True).to(device)
model.eval()  # prints out the architecture of the model



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [None]:
image = cv2.imread(os.path.join(".","data", "fam1.HEIC")) # reads the model using OpenCV

image = cv2.resize(image, (640, 480))

orig = image.copy()


image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # changing the colorspace from BGR to RGB (since Pytorch trains only RGB image)
image = image.transpose((2, 0, 1))  # swapping the color channels from channels last to channels first

image = np.expand_dims(image, axis=0)  # add batch dimension to the image
image = image / 255.0  # scaling image from (0,255) to (0,1)
image = torch.FloatTensor(image) # changes the numpy array to a tensor.


image= image.to(device)
# the image is passed to the model to get the bounding boxes
detections = model(image)[0]

In [8]:
people = 0
for i in range(0, len(detections["boxes"])):
    confidence = detections["scores"][i]  # get confidence score of each object in the image
    idx = int(detections["labels"][i])  # identifying the id of each of the classes in the image
    box = detections["boxes"][i].detach().cpu().numpy()  # gets the coordinates for the bounding boxes
    (X_1, Y_1, X_2, Y_2) = box.astype("int")

    if confidence > 0.75 and idx == 1:
        # matching the label index with its classes and its probability
        label = f"{classes[idx]}, {idx}: {confidence* 100}%"
        print(f"[INFO] {label}")
        people += 1
        cv2.rectangle(orig, (X_1, Y_1), (X_2, Y_2), colors[idx], 2)  # draw bounding boxes over each object
        y = Y_1 - 15 if Y_1 - 15 > 15 else Y_1 + 15

        # adds the label text to the image.
        cv2.putText(orig, label, (X_1, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[idx], 2)
    print(f"People: {people}")
    cv2.putText(orig, f"Number of People: {people}", (5, 19), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[idx], 2)

cv2.imwrite("./data/detected_img.jpg", orig)
cv2.imshow("Image Detection", orig)
cv2.waitKey(0)

[INFO] person, 1: 98.54804992675781%
[INFO] person, 1: 98.00418090820312%
[INFO] person, 1: 88.78005981445312%
[INFO] person, 1: 80.33210754394531%
[INFO] person, 1: 78.6150894165039%


32

In [7]:
# implementation for videos
def video_processing(video_path):
    video = cv2.VideoCapture(video_path)
    frame_width = int(video.get(3))
    frame_height = int(video.get(4))
 
    # Define the codec and create VideoWriter object.The output is stored in 'outpy.avi' file.
    out = cv2.VideoWriter('./data/outpy.mp4',cv2.VideoWriter_fourcc('M','J','P','G'), 10, (frame_width,frame_height))

    while video.isOpened():
        ret, frame = video.read()
        vid = frame.copy()
        if not ret:
          break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = transforms.functional.to_tensor(frame)
        frame = frame.to(device)
        vid_detect = model([frame])[0]    

        # return vid, vid_detect
        for i in range(0, len(vid_detect["boxes"])):
          confidence = vid_detect["scores"][i]

          if confidence > 0.75:
            idx = int(vid_detect["labels"][i])
            box = vid_detect["boxes"][i].detach().cpu().numpy()
            (X_1, Y_1, X_2, Y_2) = box.astype("int")

            label = f"{classes[idx]}, {idx}: {confidence* 100}%"
            print(f"[INFO] {label}")

            cv2.rectangle(vid, (X_1, Y_1), (X_2, Y_2), colors[idx], 2)
            y = Y_1 - 15 if Y_1 - 15 > 15 else Y_1 + 15

            cv2.putText(vid, label, (X_1, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[idx], 2)

        # cv2.imwrite("vid_detection.mp4", vid)
    
          # cv2.imshow("Video Detection", vid)
          # key = cv2.waitKey(40) & 0xFF
          # if key == ord(" "):
          #   break
    
    out.release()
    cv2.destroyAllWindows()


In [None]:
video_processing("./data/a1.mp4")

[INFO] person, 1: 99.96479797363281%
[INFO] person, 1: 99.8873291015625%
[INFO] tie, 32: 84.8359375%
[INFO] skateboard, 41: 84.439697265625%
[INFO] person, 1: 99.8567123413086%
[INFO] tie, 32: 88.97736358642578%
[INFO] person, 1: 99.85549926757812%
[INFO] tie, 32: 85.5542221069336%
[INFO] person, 1: 99.85945892333984%
[INFO] tie, 32: 76.01469421386719%
[INFO] person, 1: 99.91976928710938%
[INFO] tie, 32: 75.80422973632812%
[INFO] person, 1: 99.92200469970703%
[INFO] person, 1: 99.91909790039062%
[INFO] person, 1: 99.87517547607422%
[INFO] person, 1: 99.95360565185547%
[INFO] person, 1: 99.94219207763672%
[INFO] person, 1: 99.911865234375%
[INFO] person, 1: 99.9067153930664%
[INFO] person, 1: 99.92170715332031%
[INFO] person, 1: 99.92486572265625%
[INFO] tie, 32: 81.37924194335938%
[INFO] person, 1: 99.9447021484375%
[INFO] person, 1: 99.90923309326172%
[INFO] person, 1: 99.9104232788086%
[INFO] person, 1: 99.9041976928711%
[INFO] person, 1: 99.92522430419922%
[INFO] person, 1: 99.90491

In [None]:
vid= cv2.VideoCapture("ai.mp4")
ret = True
while ret:
  ret, frame = vid.read()

  if ret:
        cv2.imshow("Video Window", frame)
        cv2.waitKey(40)

# vid.release()
# cv2.destroyAllWindows()
