import argparse

import cv2
import numpy as np
from ultralytics import SAM, YOLO


def process_video(video_path, output_path):
    # Load models - Ultralytics downloads and caches the weights automatically
    yolo_model = YOLO("yolo11n.pt")
    sam2_model = SAM("sam2_b.pt")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise IOError(f"Could not open video: {video_path}")

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Detect objects with the YOLO model
        yolo_results = yolo_model(frame)

        # Keep only boxes of the person class (class 0 in COCO; adjust the
        # index if your model uses a different class mapping)
        boxes = yolo_results[0].boxes
        person_boxes = boxes[boxes.cls == 0].xyxy.cpu().numpy()

        # Everything starts as background; person pixels are added below
        combined_mask = np.zeros(frame.shape[:2], dtype=bool)

        if len(person_boxes) > 0:
            # Prompt SAM 2 with the detected boxes to get pixel-accurate masks
            sam_results = sam2_model(frame, bboxes=person_boxes)
            if sam_results[0].masks is not None:
                # Combine all person masks into one boolean mask
                for mask in sam_results[0].masks.data:
                    combined_mask |= mask.cpu().numpy().astype(bool)

        # Replace the background with a solid color (BGR green; change as desired)
        segmented_frame = frame.copy()
        segmented_frame[~combined_mask] = (0, 255, 0)

        out.write(segmented_frame)

    cap.release()
    out.release()


def main():
    parser = argparse.ArgumentParser(description="Process video with YOLO and SAM 2")
    parser.add_argument("input_video", help="Path to the input video file")
    parser.add_argument("output_video", help="Path to the output video file")
    args = parser.parse_args()
    process_video(args.input_video, args.output_video)


if __name__ == "__main__":
    main()
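
# Example invocation (the filename segment_people.py is illustrative, assuming
# the script is saved under that name). On first use, Ultralytics downloads the
# "yolo11n.pt" and "sam2_b.pt" weights automatically if they are not cached:
#
#   python segment_people.py input.mp4 output.mp4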