| import cv2 |
| import numpy as np |
| import gradio as gr |
| from ultralytics import YOLO |
| from PIL import Image |
| import torch |
| from torchvision import models, transforms |
|
|
| |
# YOLOv8 nano checkpoint; used for the boxes drawn on frames and uploads.
model = YOLO('yolov8n.pt')

# Faster R-CNN (ResNet-50 FPN) used by generate_caption().
# `pretrained=True` is the legacy spelling that torchvision deprecated in
# favour of the `weights=` argument; COCO_V1 is the checkpoint the legacy flag
# selected. Older torchvision releases without the weights enum fall back to
# the original call.
_frcnn_weights_enum = getattr(
    models.detection, "FasterRCNN_ResNet50_FPN_Weights", None
)
if _frcnn_weights_enum is not None:
    faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(
        weights=_frcnn_weights_enum.COCO_V1
    )
else:
    faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Inference only: switch off training-mode behaviour.
faster_rcnn.eval()

# Minimum score for a detection to be kept (passed to YOLO as `conf` and used
# as the caption score cut-off).
confidence_threshold = 0.25
# IoU value passed to YOLO as `iou`.
iou_threshold = 0.45
# (width, height) that live frames are resized to before YOLO inference.
LOW_RES = (640, 320)
|
|
| |
def detect_and_draw(frame):
    """Run YOLOv8 on a downscaled copy of ``frame`` and draw the detections.

    Args:
        frame: Image array (rows in ``shape[0]``, columns in ``shape[1]``).
            It is annotated in place.

    Returns:
        The same ``frame`` object, with a green rectangle and a
        ``"<class name> <confidence>"`` label drawn for every detection.
    """
    low_res_frame = cv2.resize(frame, LOW_RES)
    results = model.predict(
        source=low_res_frame,
        conf=confidence_threshold,
        iou=iou_threshold,
        verbose=False,
    )
    # Inference ran at LOW_RES, so boxes must be mapped back to frame pixels.
    scale_x = frame.shape[1] / LOW_RES[0]
    scale_y = frame.shape[0] / LOW_RES[1]
    for detection in results[0].boxes:
        # tolist() yields plain Python floats whichever device the box tensor
        # lives on; multiplying a CUDA tensor by a NumPy array would fail.
        raw_x1, raw_y1, raw_x2, raw_y2 = detection.xyxy[0].tolist()
        x1, y1 = int(raw_x1 * scale_x), int(raw_y1 * scale_y)
        x2, y2 = int(raw_x2 * scale_x), int(raw_y2 * scale_y)
        confidence = float(detection.conf[0])
        cls_id = int(detection.cls[0])
        label = f"{model.names[cls_id]} {confidence:.2f}"
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # Keep the label baseline inside the image for boxes at the top edge.
        label_y = max(y1 - 10, 25)
        cv2.putText(frame, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    return frame
|
|
| |
def generate_caption(image_frame):
    """Describe the Faster R-CNN detections for one image as text.

    Args:
        image_frame: Image in a form accepted by ``transforms.ToTensor``
            (callers in this file pass NumPy arrays), or ``None``.

    Returns:
        One ``"Object <label> detected with confidence <score>"`` sentence per
        detection whose score exceeds ``confidence_threshold``, joined by
        spaces. ``<label>`` is the numeric label the model returns. The result
        is ``""`` when nothing qualifies or ``image_frame`` is ``None``.
    """
    # A cleared upload widget delivers None; there is nothing to describe.
    if image_frame is None:
        return ""

    # Add a leading batch dimension: the model is called with a batch of one.
    image_tensor = transforms.ToTensor()(image_frame).unsqueeze(0)

    with torch.no_grad():
        outputs = faster_rcnn(image_tensor)

    detections = outputs[0]
    labels = detections['labels'].tolist()
    scores = detections['scores'].tolist()
    return " ".join(
        f"Object {label} detected with confidence {score:.2f}"
        for label, score in zip(labels, scores)
        if score > confidence_threshold
    )
|
|
| |
# HLS playlist that process_stream() reads frames from.
stream_url = "https://edge01.london.nginx.hdontap.com/hosb5/ng_showcase-coke_bottle-street_fixed.stream/chunklist_w464099566.m3u8"

# Only every Nth frame read from the stream is analysed and yielded.
_FRAME_STRIDE = 10


def process_stream():
    """Yield annotated RGB frames from the stream at ``stream_url``.

    Every ``_FRAME_STRIDE``-th frame is captioned with ``generate_caption``
    (the caption is printed to stdout), annotated with ``detect_and_draw`` and
    yielded as an RGB array. The generator ends without yielding when the
    stream cannot be opened, and stops at the first failed read.

    Yields:
        The annotated frame converted from BGR to RGB.
    """
    cap = cv2.VideoCapture(stream_url)
    try:
        if not cap.isOpened():
            return
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            if frame_count % _FRAME_STRIDE != 0:
                continue
            # Caption a clean RGB copy first: detect_and_draw() draws on
            # `frame` in place, and the captioner should see neither the
            # overlays nor BGR channel order.
            clean_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            caption = generate_caption(clean_rgb)
            result = detect_and_draw(frame)
            result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
            print(f"Caption: {caption}")
            yield result_rgb
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer closes
        # the generator early, so the capture handle is never leaked.
        cap.release()
|
|
| |
def predict_image(image):
    """Annotate an uploaded image with YOLOv8 and summarise what was found.

    Args:
        image: RGB image array from the upload widget, or ``None`` when the
            widget has been cleared.

    Returns:
        A ``(annotated_image, summary)`` tuple. ``annotated_image`` is an RGB
        array with the YOLO annotations drawn on it, and ``summary`` is
        ``"Objects detected: <n>, Caption: <caption>"``. For ``None`` input the
        result is ``(None, "No image provided.")``.
    """
    # Clearing the upload fires the change event with None.
    if image is None:
        return None, "No image provided."

    # Ultralytics treats NumPy input as BGR (OpenCV convention), while the
    # upload widget delivers RGB; convert on the way in and back on the way out.
    bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    results = model.predict(
        source=bgr_image,
        conf=confidence_threshold,
        iou=iou_threshold,  # same IoU setting as detect_and_draw()
    )
    annotated_image = cv2.cvtColor(results[0].plot(), cv2.COLOR_BGR2RGB)
    object_count = len(results[0].boxes)

    # The captioner is given the untouched RGB upload.
    caption = generate_caption(image)

    return annotated_image, f"Objects detected: {object_count}, Caption: {caption}"
|
|
| |
# Two-tab UI: a live-stream viewer and a single-image uploader.
with gr.Blocks() as demo:
    gr.Markdown("## YOLOv8 Object Detection with Image Captioning (Faster R-CNN)")
    with gr.Tab("Live Video"):
        gr.Markdown("### Real-Time Object Detection and Captioning from Live Stream")
        # Output-only component. `streaming=True` is meant for webcam input and
        # is rejected for other sources, so it is not set here.
        live_output = gr.Image(label="Live Video with YOLOv8 Annotations", interactive=False)
    with gr.Tab("Upload Image"):
        gr.Markdown("### Object Detection and Captioning from Uploaded Image")
        uploaded_image = gr.Image(type="numpy", label="Upload Image")
        image_output = gr.Image(label="Annotated Image with YOLOv8 Annotations")
        object_count_image = gr.Textbox(label="Object Count and Caption", interactive=False)

        uploaded_image.change(fn=predict_image, inputs=uploaded_image, outputs=[image_output, object_count_image])

    # Start the frame generator when the page loads. Binding it to
    # live_output.change would never fire initially and, once fired, would
    # re-trigger on every frame the generator itself pushes.
    demo.load(fn=process_stream, inputs=None, outputs=live_output)
|
|
| |
if __name__ == "__main__":
    # Script entry point: optionally move YOLO to the GPU, then serve the UI.
    gpu_present = torch.cuda.is_available()
    if gpu_present:
        model.to('cuda')

    # Enable the request queue first, then start the Gradio server.
    demo.queue()
    demo.launch()