Hbvsa committed on
Commit d9cb2e6
1 Parent(s): d8bad69

Update app.py

Files changed (1)
  1. app.py +119 -119
app.py CHANGED
@@ -96,133 +96,133 @@ class ImageClassifier:
         logit, predicted = torch.max(output.data, 1)
         return self.labels[predicted[0].item()], logit[0].item()
 
-class VideoObjectDetection:
-
-    def __init__(self,
-                 text_prompt: str
-                 ):
-
-        self.text_prompt = text_prompt
-
-    def crop(self, frame, boxes):
-
-        h, w, _ = frame.shape
-        boxes = boxes * torch.Tensor([w, h, w, h])
-        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
-        min_col, min_row, max_col, max_row = map(int, xyxy[0])
-        crop_image = frame[min_row:max_row, min_col:max_col, :]
-
-        return crop_image
-
-    def annotate(self,
-                 image_source: np.ndarray,
-                 boxes: torch.Tensor,
-                 logits: torch.Tensor,
-                 phrases: List[str],
-                 frame_rgb: np.ndarray,
-                 classifier) -> np.ndarray:
-
-        h, w, _ = image_source.shape
-        boxes = boxes * torch.Tensor([w, h, w, h])
-        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
-        detections = sv.Detections(xyxy=xyxy)
-        print(xyxy.shape)
-        custom_labels = []
-        custom_logits = []
-
-        for box in xyxy:
-
-            min_col, min_row, max_col, max_row = map(int, box)
-            crop_image = frame_rgb[min_row:max_row, min_col:max_col, :]
-            label, logit = classifier.predict(crop_image)
-            print()
-            if logit >= 1:
-                custom_labels.append(label)
-                custom_logits.append(logit)
-            else:
-                custom_labels.append('unknown human face')
-                custom_logits.append(logit)
-
-        labels = [
-            f"{phrase} {logit:.2f}"
-            for phrase, logit
-            in zip(custom_labels, custom_logits)
-        ]
-
-        box_annotator = sv.BoxAnnotator()
-        annotated_frame = box_annotator.annotate(scene=image_source, detections=detections, labels=labels)
-        return annotated_frame
-
-    def preprocess_image(self, image: np.ndarray) -> torch.Tensor:
-        transform = T.Compose(
-            [
-                T.RandomResize([800], max_size=1333),
-                T.ToTensor(),
-                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
-            ]
-        )
-
-        image_pillow = Image.fromarray(image)
-        image_transformed, _ = transform(image_pillow, None)
-        return image_transformed
-
-
-    def generate_video(self, video_path) -> None:
-
-        # Load model, set up variables and get video properties
-        cap, fps, width, height, fourcc = get_video_properties(video_path)
-        model = load_model("GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
-                           "GroundingDINO/weights/groundingdino_swint_ogc.pth")
-        predictor = ImageClassifier()
-        TEXT_PROMPT = self.text_prompt
-        BOX_TRESHOLD = 0.6
-        TEXT_TRESHOLD = 0.6
-
-        # Read video frames, crop image based on text prompt object detection and generate dataset_train
-        import time
-        frame_count = 0
-        delay = 1 / fps  # Delay in seconds between frames
-        while cap.isOpened():
-            start_time = time.time()
-            ret, frame = cap.read()
-            if not ret:
-                break
-
-            if cv2.waitKey(1) & 0xff == ord('q'):
-                break
-
-            # Convert bgr frame to rgb frame to image to torch tensor transformed
-            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            image_transformed = self.preprocess_image(frame_rgb)
-
-            boxes, logits, phrases = predict(
-                model=model,
-                image=image_transformed,
-                caption=TEXT_PROMPT,
-                box_threshold=BOX_TRESHOLD,
-                text_threshold=TEXT_TRESHOLD
-            )
-
-            # Get boxes
-            if boxes.size()[0] > 0:
-                annotated_frame = self.annotate(image_source=frame, boxes=boxes, logits=logits,
-                                                phrases=phrases, frame_rgb=frame_rgb, classifier=predictor)
-                # cv2.imshow('Object detection', annotated_frame)
-                frame_rgb = cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB)
-
-                yield frame_rgb
-            elapsed_time = time.time() - start_time
-            time_to_wait = max(delay - elapsed_time, 0)
-            time.sleep(time_to_wait)
-
-            frame_count += 1
 
 @spaces.GPU()
 def pipeline_to_setup_with_gpu():
     run_commands()
     from GroundingDINO.groundingdino.util.inference import load_model, predict
     import GroundingDINO.groundingdino.datasets.transforms as T
+class VideoObjectDetection:
+
+    def __init__(self,
+                 text_prompt: str
+                 ):
+
+        self.text_prompt = text_prompt
+
+    def crop(self, frame, boxes):
+
+        h, w, _ = frame.shape
+        boxes = boxes * torch.Tensor([w, h, w, h])
+        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
+        min_col, min_row, max_col, max_row = map(int, xyxy[0])
+        crop_image = frame[min_row:max_row, min_col:max_col, :]
+
+        return crop_image
+
+    def annotate(self,
+                 image_source: np.ndarray,
+                 boxes: torch.Tensor,
+                 logits: torch.Tensor,
+                 phrases: List[str],
+                 frame_rgb: np.ndarray,
+                 classifier) -> np.ndarray:
+
+        h, w, _ = image_source.shape
+        boxes = boxes * torch.Tensor([w, h, w, h])
+        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
+        detections = sv.Detections(xyxy=xyxy)
+        print(xyxy.shape)
+        custom_labels = []
+        custom_logits = []
+
+        for box in xyxy:
+
+            min_col, min_row, max_col, max_row = map(int, box)
+            crop_image = frame_rgb[min_row:max_row, min_col:max_col, :]
+            label, logit = classifier.predict(crop_image)
+            print()
+            if logit >= 1:
+                custom_labels.append(label)
+                custom_logits.append(logit)
+            else:
+                custom_labels.append('unknown human face')
+                custom_logits.append(logit)
+
+        labels = [
+            f"{phrase} {logit:.2f}"
+            for phrase, logit
+            in zip(custom_labels, custom_logits)
+        ]
+
+        box_annotator = sv.BoxAnnotator()
+        annotated_frame = box_annotator.annotate(scene=image_source, detections=detections, labels=labels)
+        return annotated_frame
+
+    def preprocess_image(self, image: np.ndarray) -> torch.Tensor:
+        transform = T.Compose(
+            [
+                T.RandomResize([800], max_size=1333),
+                T.ToTensor(),
+                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+            ]
+        )
+
+        image_pillow = Image.fromarray(image)
+        image_transformed, _ = transform(image_pillow, None)
+        return image_transformed
+
+
+    def generate_video(self, video_path) -> None:
+
+        # Load model, set up variables and get video properties
+        cap, fps, width, height, fourcc = get_video_properties(video_path)
+        model = load_model("GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
+                           "GroundingDINO/weights/groundingdino_swint_ogc.pth")
+        predictor = ImageClassifier()
+        TEXT_PROMPT = self.text_prompt
+        BOX_TRESHOLD = 0.6
+        TEXT_TRESHOLD = 0.6
+
+        # Read video frames, crop image based on text prompt object detection and generate dataset_train
+        import time
+        frame_count = 0
+        delay = 1 / fps  # Delay in seconds between frames
+        while cap.isOpened():
+            start_time = time.time()
+            ret, frame = cap.read()
+            if not ret:
+                break
+
+            if cv2.waitKey(1) & 0xff == ord('q'):
+                break
+
+            # Convert bgr frame to rgb frame to image to torch tensor transformed
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            image_transformed = self.preprocess_image(frame_rgb)
+
+            boxes, logits, phrases = predict(
+                model=model,
+                image=image_transformed,
+                caption=TEXT_PROMPT,
+                box_threshold=BOX_TRESHOLD,
+                text_threshold=TEXT_TRESHOLD
+            )
+
+            # Get boxes
+            if boxes.size()[0] > 0:
+                annotated_frame = self.annotate(image_source=frame, boxes=boxes, logits=logits,
+                                                phrases=phrases, frame_rgb=frame_rgb, classifier=predictor)
+                # cv2.imshow('Object detection', annotated_frame)
+                frame_rgb = cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB)
+
+                yield frame_rgb
+            elapsed_time = time.time() - start_time
+            time_to_wait = max(delay - elapsed_time, 0)
+            time.sleep(time_to_wait)
+
+            frame_count += 1
 video_annotator = VideoObjectDetection(
     text_prompt='human face')
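
For context, and not part of this commit: generate_video is a generator that yields annotated RGB frames, so elsewhere in app.py it is presumably streamed into the Space's UI. Below is a minimal sketch of that wiring, assuming Gradio; the stream_annotated_frames wrapper and the component choices are illustrative assumptions, not code from this repository.

import gradio as gr

def stream_annotated_frames(video_path):
    # Assumed wrapper: forward each annotated RGB frame (numpy array)
    # yielded by the detection/classification pipeline to the UI as it arrives.
    for frame_rgb in video_annotator.generate_video(video_path):
        yield frame_rgb

# Gradio treats a generator function as a streaming output:
# each yielded frame replaces the previous Image output.
demo = gr.Interface(
    fn=stream_annotated_frames,
    inputs=gr.Video(label="Input video"),      # provides a file path
    outputs=gr.Image(label="Annotated frame"),
)

if __name__ == "__main__":
    demo.launch()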