Spaces:

Dorn4449
/

yolodo

Runtime error

App Files Files Community

yolodo / app.py

Dorn4449

Update app.py

8d6fec1 verified over 1 year ago

raw

history blame contribute delete

4.06 kB

	import cv2
	import numpy as np
	import gradio as gr
	from ultralytics import YOLO
	from PIL import Image
	import torch
	from torchvision import models, transforms

	# Thresholds shared by the detectors below, and the (width, height) that
	# live frames are shrunk to before YOLO inference.
	LOW_RES = (640, 320)
	confidence_threshold = 0.25
	iou_threshold = 0.45

	# YOLOv8 (nano weights) produces the annotated bounding boxes.
	model = YOLO('yolov8n.pt')

	# Faster R-CNN (ResNet-50 FPN) feeds the text "captions"; eval() switches
	# it to inference mode.
	faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
	faster_rcnn.eval()

	# Function to detect objects and draw bounding boxes
	def detect_and_draw(frame):
	    """Run YOLOv8 on a downscaled copy of ``frame`` and annotate ``frame``.

	    The frame is resized to ``LOW_RES`` for inference, and each detected box
	    is scaled back to the original resolution before being drawn.

	    Args:
	        frame: Image array indexed as (height, width, ...). It is drawn on
	            in place. Presumably a BGR frame from OpenCV -- confirm against
	            the caller.

	    Returns:
	        The same ``frame`` object with rectangles and "<class> <conf>"
	        labels drawn on it. ``None`` or an empty array is returned
	        unchanged.
	    """
	    if frame is None or frame.size == 0:
	        return frame

	    frame_height, frame_width = frame.shape[:2]
	    low_res_frame = cv2.resize(frame, LOW_RES)
	    results = model.predict(source=low_res_frame, conf=confidence_threshold, iou=iou_threshold, verbose=False)
	    scale_x = frame_width / LOW_RES[0]
	    scale_y = frame_height / LOW_RES[1]

	    for detection in results[0].boxes:
	        # tolist() yields plain Python floats whatever device the boxes live
	        # on; multiplying the tensor by a NumPy array would force an implicit
	        # tensor -> ndarray conversion, which fails for CUDA tensors.
	        bx1, by1, bx2, by2 = detection.xyxy[0].tolist()
	        left, top = int(bx1 * scale_x), int(by1 * scale_y)
	        right, bottom = int(bx2 * scale_x), int(by2 * scale_y)

	        confidence = float(detection.conf[0])
	        cls_id = int(detection.cls[0])
	        label = f"{model.names[cls_id]} {confidence:.2f}"

	        cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
	        # Keep the text baseline inside the image for boxes touching the top.
	        text_y = max(top - 10, 25)
	        cv2.putText(frame, label, (left, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
	    return frame

	# Function to generate captions using Faster R-CNN
	def generate_caption(image_frame):
	    """Describe the objects Faster R-CNN finds in ``image_frame``.

	    Args:
	        image_frame: Image accepted by ``transforms.ToTensor`` (a PIL image
	            or an H x W x C array), or ``None``. Presumably RGB, which is
	            what torchvision detection models are trained on -- confirm
	            against the caller.

	    Returns:
	        One sentence per detection whose score exceeds
	        ``confidence_threshold``, joined by single spaces. Each sentence
	        reports the model's integer label id and the score. Returns an
	        empty string when there is no image or nothing passes the threshold.
	    """
	    # Gradio's change event passes None when the image is cleared.
	    if image_frame is None:
	        return ""

	    # Add a leading batch dimension: the model takes a batch of images.
	    image_tensor = transforms.ToTensor()(image_frame).unsqueeze(0)

	    with torch.no_grad():
	        outputs = faster_rcnn(image_tensor)

	    prediction = outputs[0]
	    return " ".join(
	        f"Object {int(label)} detected with confidence {float(score):.2f}"
	        for label, score in zip(prediction['labels'], prediction['scores'])
	        if score > confidence_threshold
	    )

	# Define the stream URL for live video; process_stream() opens it with
	# cv2.VideoCapture.
	# NOTE(review): the path ends in a "chunklist_w<number>.m3u8" segment, which
	# looks session-specific -- confirm this URL is stable, or point at the
	# stream's top-level playlist instead.
	stream_url = "https://edge01.london.nginx.hdontap.com/hosb5/ng_showcase-coke_bottle-street_fixed.stream/chunklist_w464099566.m3u8"

	# Process video stream and generate captions
	def process_stream():
	    """Yield annotated RGB frames from the live stream at ``stream_url``.

	    Every 10th frame read from the stream is captioned and annotated; the
	    caption is printed and the annotated frame is yielded. The generator
	    ends without yielding if the stream cannot be opened, and stops at the
	    first failed read.

	    Yields:
	        The annotated frame converted from BGR to RGB.
	    """
	    cap = cv2.VideoCapture(stream_url)
	    # try/finally releases the capture even when the consumer closes the
	    # generator early or a model call raises.
	    try:
	        if not cap.isOpened():
	            return
	        frame_count = 0
	        while cap.isOpened():
	            ret, frame = cap.read()
	            if not ret:
	                break
	            frame_count += 1
	            if frame_count % 10 != 0:  # Process every 10th frame for efficiency
	                continue
	            # Caption a clean RGB copy first: detect_and_draw() paints boxes
	            # onto `frame` in place, and OpenCV frames are BGR.
	            clean_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	            caption = generate_caption(clean_rgb)
	            result = detect_and_draw(frame)
	            result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
	            print(f"Caption: {caption}")
	            yield result_rgb
	    finally:
	        cap.release()

	# Function to predict and annotate an uploaded image
	def predict_image(image):
	    """Annotate an uploaded image with YOLOv8 and summarise what was found.

	    Args:
	        image: Array from the Gradio image input, or ``None`` when the input
	            is cleared. Assumed to be an H x W x 3 RGB array (Gradio's
	            default for ``type="numpy"``) -- confirm if the component's
	            image mode is changed.

	    Returns:
	        A tuple ``(annotated_image, summary)``: the RGB image with YOLOv8
	        boxes drawn on it, and a string with the object count and the
	        Faster R-CNN caption. Returns ``(None, "")`` when there is no image.
	    """
	    if image is None:
	        return None, ""

	    # Ultralytics treats NumPy input as BGR and plot() returns BGR, so
	    # convert on the way in and back again on the way out.
	    image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
	    results = model.predict(source=image_bgr, conf=confidence_threshold)
	    annotated_image = cv2.cvtColor(results[0].plot(), cv2.COLOR_BGR2RGB)
	    object_count = len(results[0].boxes)

	    # Generate caption for the uploaded image (kept in RGB for Faster R-CNN)
	    caption = generate_caption(image)

	    return annotated_image, f"Objects detected: {object_count}, Caption: {caption}"

	# Gradio interface: one tab for the live stream, one for uploaded images.
	with gr.Blocks() as demo:
	    gr.Markdown("## YOLOv8 Object Detection with Image Captioning (Faster R-CNN)")
	    with gr.Tab("Live Video"):
	        gr.Markdown("### Real-Time Object Detection and Captioning from Live Stream")
	        # Output-only image. `streaming=True` is meant for webcam input and
	        # makes gr.Image raise ValueError unless the source is webcam-only.
	        live_output = gr.Image(label="Live Video with YOLOv8 Annotations", interactive=False)
	    with gr.Tab("Upload Image"):
	        gr.Markdown("### Object Detection and Captioning from Uploaded Image")
	        uploaded_image = gr.Image(type="numpy", label="Upload Image")
	        image_output = gr.Image(label="Annotated Image with YOLOv8 Annotations")
	        object_count_image = gr.Textbox(label="Object Count and Caption", interactive=False)

	        uploaded_image.change(fn=predict_image, inputs=uploaded_image, outputs=[image_output, object_count_image])

	    # Start the frame generator when the page loads. Wiring it to the output
	    # image's own change event never fires, because nothing else updates
	    # that image first.
	    demo.load(fn=process_stream, inputs=None, outputs=live_output)

	# Entry point: move YOLO to the GPU when one is present, then serve the UI.
	if __name__ == "__main__":
	    use_gpu = torch.cuda.is_available()
	    if use_gpu:
	        model.to('cuda')
	    # queue() returns the Blocks instance, so it can be chained into launch().
	    demo.queue().launch()