import datetime
import io
import time

import torch
import gradio as gr
import cv2

from transformers import AutoFeatureExtractor, AutoModelForObjectDetection

extractor = AutoFeatureExtractor.from_pretrained("hustvl/yolos-tiny")
model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

BBOX_COLOR = [255, 0, 0]
PRED_THRESHOLD = 0.90


def composite_predictions(img, processed_predictions, show_video=False):
    # Keep only "person" detections (COCO label id 1)
    interested_labels = processed_predictions["labels"] == 1
    scores = processed_predictions["scores"][interested_labels].tolist()
    boxes = [[int(j) for j in x] for x in processed_predictions["boxes"][interested_labels].tolist()]
    labels = [model.config.id2label[x] for x in processed_predictions["labels"][interested_labels].tolist()]

    # Draw each bounding box and its label/score on the frame.
    # Boxes are (xmin, ymin, xmax, ymax), so pass the two corner points explicitly.
    for score, box, label in zip(scores, boxes, labels):
        cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), BBOX_COLOR, 1)
        cv2.putText(img, f"{label}: {score:0.2f}", (box[0] + 2, box[1] + 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.33, BBOX_COLOR, 1, cv2.LINE_AA)

    return img, len(boxes), datetime.datetime.now()


def process(img):
    inputs = extractor(images=img, return_tensors="pt")
    outputs = model(**inputs)

    # Post-processing expects the original (height, width) so it can rescale the boxes
    h, w, _ = img.shape
    img_size = torch.tensor([(h, w)])
    processed = extractor.post_process_object_detection(outputs, threshold=PRED_THRESHOLD, target_sizes=img_size)

    # Composite the predicted bounding boxes and labels onto the image
    return composite_predictions(img, processed[0])


with gr.Blocks() as demo:
    stream = gr.State()
    with gr.Row():
        with gr.Column(scale=1, min_width=600):
            last_refresh_box = gr.Textbox(label="Last updated")
            attendance_label = gr.Label(label="Current Attendance")
    with gr.Row():
        with gr.Column(scale=1, min_width=600):
            webcam = gr.Webcam(streaming=True)
            output = gr.Image(label="Composite", visible=True)
            webcam.stream(process, [webcam], [output, attendance_label, last_refresh_box])

demo.queue().launch()