|
import gradio as gr |
|
import cv2 |
|
from transformers import YolosImageProcessor, YolosForObjectDetection |
|
from PIL import Image |
|
import torch |
|
|
|
|
|
# Pretrained YOLOS-tiny checkpoint from the Hugging Face hub: the processor
# converts frames into model inputs, the model performs object detection.
image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")
|
|
|
def process_frame(frame, size=(640, 360), threshold=0.9):
    """Run YOLOS object detection on one BGR frame and draw the detections.

    Args:
        frame: a BGR image (numpy array) as produced by cv2.
        size: (width, height) the frame is resized to before inference.
            Defaults to (640, 360), matching the original behavior.
        threshold: minimum confidence score for a detection to be drawn.

    Returns:
        The resized BGR frame with bounding boxes and score labels drawn on it.
    """
    frame = cv2.resize(frame, size)

    # The model expects RGB input; cv2 decodes frames as BGR.
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    inputs = image_processor(images=image, return_tensors="pt")

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # image.size is (width, height); post-processing expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]])
    results = image_processor.post_process_object_detection(
        outputs, threshold=threshold, target_sizes=target_sizes)[0]

    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        box = [round(coord, 2) for coord in box.tolist()]
        x0, y0, x1, y1 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
        cv2.rectangle(frame, (x0, y0), (x1, y1), (0, 255, 0), 2)
        # Label text is drawn just above the top-left corner of the box.
        cv2.putText(frame, f"{model.config.id2label[label.item()]}: {round(score.item(), 2)}",
                    (x0, y0 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return frame
|
|
|
def video_object_detection(video):
    """Run object detection on every frame of a video and write an annotated copy.

    Frames are processed and streamed straight into the output writer instead
    of being buffered in memory, so long videos do not exhaust RAM.

    Args:
        video: path to the input video file (as supplied by gradio).

    Returns:
        Path to the annotated output video ('/tmp/output.mp4').

    Raises:
        ValueError: if the video cannot be opened or contains no readable frames.
    """
    cap = cv2.VideoCapture(video)
    writer = None
    try:
        # Preserve the source frame rate; fall back to 20 fps when the
        # container does not report one (cap.get returns 0 in that case).
        fps = cap.get(cv2.CAP_PROP_FPS) or 20

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            processed = process_frame(frame)

            # Create the writer lazily so its dimensions match the
            # (resized) processed frames rather than the raw input.
            if writer is None:
                height, width, _ = processed.shape
                writer = cv2.VideoWriter('/tmp/output.mp4',
                                         cv2.VideoWriter_fourcc(*'mp4v'),
                                         fps, (width, height))

            writer.write(processed)
    finally:
        # Release resources even if processing a frame raises.
        cap.release()
        if writer is not None:
            writer.release()

    # The original code crashed with IndexError here; fail with a clear message.
    if writer is None:
        raise ValueError("Could not read any frames from the input video")

    return '/tmp/output.mp4'
|
|
|
|
|
# Wire the detector into a simple web UI: upload a video, get back the
# annotated video. live=True re-runs the function whenever the input changes.
iface = gr.Interface(
    fn=video_object_detection,
    inputs="video",
    outputs="video",
    title="YOLOs-Tiny Video Detection",
    live=True,
)
iface.launch()
|
|