import json
import logging
import os
import random
from typing import List

import cv2
import numpy as np
import torch
from ultralytics import YOLOWorld


def _as_int_coords(box):
    """Return ``(x1, y1, x2, y2)`` as plain Python ints.

    Accepts a torch tensor, a NumPy array, or any sequence holding exactly
    four values (extra singleton dimensions are flattened away). Fractional
    values are truncated toward zero by ``int()``.

    Raises:
        ValueError: if *box* does not contain exactly four values.
    """
    if isinstance(box, torch.Tensor):
        # Tensors may live on another device or carry grad; bring them to NumPy first.
        box = box.detach().cpu().numpy()
    x1, y1, x2, y2 = (int(value) for value in np.asarray(box).reshape(-1))
    return x1, y1, x2, y2


class YoloWorld:
    """Thin wrapper around an Ultralytics ``YOLOWorld`` model for prompt-driven detection."""

    def __init__(self, model_name="yolov8x-worldv2.pt"):
        """Load the weights named by *model_name* and move the model to the CPU."""
        self.model = YOLOWorld(model_name)
        self.model.to(device='cpu')

    def run_inference(self, image_path: str, object_prompts: List):
        """Detect the prompted classes in an image and return per-object details.

        Args:
            image_path: Source handed to ``self.model.predict``.
            object_prompts: Class names handed to ``self.model.set_classes``.

        Returns:
            A list with one single-key dict per detected box, shaped as
            ``{label: {'bounding_box': [x1, y1, x2, y2], 'confidence': conf}}``.
            Coordinates are Python ints and the confidence is rounded to two
            decimals. Each detection is also printed to stdout.
        """
        object_details = []
        self.model.set_classes(object_prompts)
        results = self.model.predict(image_path)
        for result in results:
            for box in result.boxes:
                x1, y1, x2, y2 = _as_int_coords(box.xyxy)
                confidence = round(float(box.conf.cpu()), 2)
                # Look the class name up on the result that owns this box.
                label = str(result.names[int(box.cls)])
                print("Object Name :{} Bounding Box:{},{} Confidence score {}\n ".format(label, (x1, y1), (x2, y2), confidence))
                object_details.append({
                    label: {
                        'bounding_box': [x1, y1, x2, y2],
                        'confidence': confidence,
                    }
                })
        return object_details

    @staticmethod
    def draw_bboxes(rgb_frame, boxes, labels, line_thickness=3):
        """Read an image from disk and draw labelled boxes on an RGB copy of it.

        Args:
            rgb_frame: Path of the image file; it is read with ``cv2.imread``
                and converted from BGR to RGB.
            boxes: Iterable of ``(x1, y1, x2, y2)`` boxes. Each may be a torch
                tensor, a NumPy array, or a plain sequence.
            labels: Iterable of labels paired positionally with *boxes*.
            line_thickness: Rectangle thickness. A falsy value selects a
                thickness derived from the image height and width.

        Returns:
            The annotated RGB image as a NumPy array.
        """
        rgb_frame = cv2.cvtColor(cv2.imread(rgb_frame), cv2.COLOR_BGR2RGB)
        # line/font thickness
        tl = line_thickness or round(0.002 * (rgb_frame.shape[0] + rgb_frame.shape[1]) / 2) + 1
        tf = max(tl - 1, 1)  # font thickness
        annotated = rgb_frame.copy()

        # One random colour per distinct label, so boxes of a class share a colour.
        color_dict = {}
        for item in np.unique(np.asarray(labels)):
            color_dict[item] = [random.randint(28, 255) for _ in range(3)]

        for box, label in zip(boxes, labels):
            # Normalise to Python ints so tensors, arrays and lists are all accepted.
            x1, y1, x2, y2 = _as_int_coords(box)
            colour = color_dict[label]
            cv2.rectangle(annotated, (x1, y1), (x2, y2), colour, thickness=tl, lineType=cv2.LINE_AA)
            # The label text is placed just above the top-left corner of the box.
            cv2.putText(annotated, str(label), (x1, y1 - 2), 0, tl / 3, colour, thickness=tf, lineType=cv2.LINE_AA)
        return annotated

    def format_detections(self, boxes, labels):
        """Return one ``"<label>\\tBounding Box :<box>\\n"`` line per (box, label) pair."""
        return "".join(
            "{}\tBounding Box :{}\n".format(label, box)
            for box, label in zip(boxes, labels)
        )

    def run_yolo_infer(self, image_path: str, object_prompts: List):
        """Detect the prompted classes, save an annotated image, and describe the result.

        Prediction is run with ``conf=0.40``. The annotated image is written
        to ``final_mask.png`` in the current working directory.

        Args:
            image_path: Path of the image to analyse; it is also re-read from
                disk for drawing.
            object_prompts: Class names handed to ``self.model.set_classes``.

        Returns:
            A text summary naming the saved image followed by one line per
            detection (label and integer bounding box).
        """
        bounding_boxes = []
        labels = []
        self.model.set_classes(object_prompts)
        results = self.model.predict(image_path, conf=0.40)
        for result in results:
            for box in result.boxes:
                # Plain ints keep the textual summary free of NumPy scalar reprs.
                bounding_boxes.append(list(_as_int_coords(box.xyxy)))
                labels.append(result.names[int(box.cls.cpu())])

        detected_image = self.draw_bboxes(
            rgb_frame=image_path,
            boxes=bounding_boxes,
            labels=labels,
        )
        predicted_data = self.format_detections(bounding_boxes, labels)

        # save image: draw_bboxes returns RGB, while cv2.imwrite expects BGR.
        cv2.imwrite('final_mask.png', cv2.cvtColor(detected_image, cv2.COLOR_RGB2BGR))
        return "Predicted image : final_mask.png .\nDetails :\n{}".format(predicted_data)


if __name__ == "__main__":
    yolo = YoloWorld()
    predicted_data = yolo.run_yolo_infer('../image_store/demo2.jpg', ['person', 'hat', 'building'])
    print(predicted_data)