Spaces:

Saiky2k
/

DepthPro_CVProject

Sleeping

App Files Files Community

DepthPro_CVProject / app.py

Saiky2k

Update app.py

d3075f7 verified 8 months ago

raw

history blame contribute delete

12.5 kB

	# app.py
	import streamlit as st
	from PIL import Image
	import cv2
	import numpy as np
	import torch
	from ultralytics import YOLO
	import time
	import tempfile
	import os
	import requests
	from io import BytesIO

	# Tạo module depth_pro đơn giản (để thay thế module gốc)
	class DepthPro:
	    """Minimal stand-in for the ``depth_pro`` module.

	    Exposes ``create_model_and_transforms()`` which returns a Hugging Face
	    depth-estimation model (checkpoint ``vinvino02/glpn-nyu``) extended with an
	    ``infer`` method, plus a preprocessing function.
	    """

	    @staticmethod
	    def _normalize_depth(predicted_depth, scale=10.0):
	        """Min-max normalise a depth tensor and stretch it to ``[0, scale]``.

	        Args:
	            predicted_depth: Tensor of raw depth predictions.
	            scale: Upper bound of the output range (10 by default, the factor
	                the app uses to present values as metres).

	        Returns:
	            A tensor of the same shape. When the input has no spread
	            (max == min) or the spread is NaN, zeros are returned instead of
	            the NaNs a plain division by zero would produce.
	        """
	        depth_min = torch.min(predicted_depth)
	        depth_range = torch.max(predicted_depth) - depth_min
	        if not depth_range > 0:
	            return torch.zeros_like(predicted_depth)
	        return (predicted_depth - depth_min) / depth_range * scale

	    @staticmethod
	    def create_model_and_transforms():
	        """Load the depth model and build its preprocessing transform.

	        Returns:
	            ``(model, transform)`` where ``transform(image)`` returns the
	            processor's ``pixel_values`` tensor and ``model.infer(image,
	            f_px=None)`` returns ``{"depth": tensor}``.
	        """
	        # Imported lazily so that merely importing this file does not require
	        # transformers to be importable.
	        from transformers import AutoImageProcessor, AutoModelForDepthEstimation

	        processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-nyu")
	        model = AutoModelForDepthEstimation.from_pretrained("vinvino02/glpn-nyu")

	        def transform(image):
	            """Preprocess an image into the model's input tensor."""
	            return processor(images=image, return_tensors="pt").pixel_values

	        def infer_method(self, image, f_px=None):
	            """Run the model without gradients and return normalised depth.

	            ``f_px`` is accepted for call compatibility but is not used.
	            The result is min-max normalised per call, so values are relative
	            to the current image rather than absolute distances.
	            """
	            with torch.no_grad():
	                outputs = self(image)
	            return {"depth": DepthPro._normalize_depth(outputs.predicted_depth)}

	        # Bind infer_method to this model instance so it can be called as model.infer(...).
	        model.infer = infer_method.__get__(model)

	        return model, transform

	# Hàm tải mô hình YOLO từ Hugging Face
	@st.cache_resource
	def load_yolo_model():
	    """Build and return the YOLO detector from the ``yolov8n.pt`` weights.

	    Decorated with ``st.cache_resource`` (Streamlit's resource cache).
	    """
	    return YOLO("yolov8n.pt")

	# Hàm tải và chuẩn bị mô hình độ sâu
	@st.cache_resource
	def load_depth_model():
	    """Return the ``(model, transform)`` pair for depth estimation.

	    Delegates to ``DepthPro.create_model_and_transforms``; decorated with
	    ``st.cache_resource`` (Streamlit's resource cache).
	    """
	    depth_model, preprocess = DepthPro.create_model_and_transforms()
	    return depth_model, preprocess

	# Hàm xử lý video
	def _detect_people(yolo_model, image, box_scale, confidence):
	    """Return integer ``(x1, y1, x2, y2)`` boxes for every "person" detection.

	    Coordinates reported for ``image`` are divided by ``box_scale`` so that
	    they refer to the full-resolution frame. Only detections whose score is
	    strictly greater than ``confidence`` are kept.
	    """
	    person_boxes = []
	    for result in yolo_model(image):
	        boxes = result.boxes.xyxy.cpu().numpy()
	        classes = result.boxes.cls.cpu().numpy()
	        confs = result.boxes.conf.cpu().numpy()
	        for box, cls, conf in zip(boxes, classes, confs):
	            if result.names[int(cls)] == "person" and conf > confidence:
	                person_boxes.append(tuple(int(v / box_scale) for v in box[:4]))
	    return person_boxes


	def _depth_to_colormap(depth_np):
	    """Render a depth map as a BGR TURBO image (smaller depth -> higher colormap value)."""
	    depth_min = depth_np.min()
	    depth_range = depth_np.max() - depth_min
	    if depth_range > 0:
	        normalized = (depth_np - depth_min) / depth_range
	    else:
	        # Uniform (or NaN) depth: avoid dividing by zero.
	        normalized = np.zeros_like(depth_np)
	    return cv2.applyColorMap(((1 - normalized) * 255).astype(np.uint8), cv2.COLORMAP_TURBO)


	def _annotate_person(frame, box, depth_value):
	    """Draw the bounding box and a filled "Depth: x m" label on ``frame`` in place."""
	    x1, y1, x2, y2 = box
	    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

	    text = f"Depth: {depth_value:.2f} m"
	    font = cv2.FONT_HERSHEY_SIMPLEX
	    font_scale = 0.8
	    font_thickness = 2
	    text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]

	    # The label sits just above the top-left corner of the box.
	    text_x = x1
	    text_y = y1 - 10
	    cv2.rectangle(
	        frame,
	        (text_x - 5, text_y - text_size[1] - 10),
	        (text_x + text_size[0] + 5, text_y + 5),
	        (0, 255, 0),
	        -1,
	    )
	    cv2.putText(frame, text, (text_x, text_y), font, font_scale, (0, 0, 0), font_thickness)


	def _read_bytes(path):
	    """Return the full contents of ``path``, closing the file afterwards."""
	    with open(path, 'rb') as handle:
	        return handle.read()


	def process_video(video_path, confidence=0.5):
	    """Detect people in a video and label each with an estimated depth.

	    Every frame is run through YOLO (on a copy down-scaled to 640 px wide when
	    the source is wider) and through the depth model. Two videos are written
	    to a fresh temporary directory: the annotated frames and the depth
	    colormap. Progress and a live preview are rendered through Streamlit.

	    Args:
	        video_path: Path of the input video file.
	        confidence: Minimum detection score (exclusive) for a "person" box to
	            be kept. Defaults to 0.5, the previously hard-coded threshold.

	    Returns:
	        ``(output_video_path, output_depth_path)``.

	    Raises:
	        RuntimeError: If the video cannot be opened or reports no frame size.
	    """
	    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	    st.info(f"Đang sử dụng thiết bị: {device}")

	    with st.spinner('Đang tải mô hình YOLO...'):
	        yolo_model = load_yolo_model()
	        if device.type == 'cuda':
	            yolo_model.to(device)

	    with st.spinner('Đang tải mô hình độ sâu...'):
	        depth_model, transform = load_depth_model()
	        depth_model.eval()
	        if device.type == 'cuda':
	            depth_model.to(device)

	    cap = cv2.VideoCapture(video_path)
	    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
	    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
	    if not cap.isOpened() or width <= 0 or height <= 0:
	        # Fail early with a clear message instead of dividing by a zero width below.
	        cap.release()
	        raise RuntimeError(f"Không thể mở video: {video_path}")

	    fps = cap.get(cv2.CAP_PROP_FPS)
	    if not fps > 0:
	        # Missing/invalid FPS metadata (0 or NaN): fall back to a sane default.
	        fps = 30.0
	    # The reported frame count can be missing or inaccurate; never trust it as a divisor.
	    total_frames = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0)

	    temp_output_dir = tempfile.mkdtemp()
	    output_video_path = os.path.join(temp_output_dir, "person_detection_with_depth.mp4")
	    output_depth_path = os.path.join(temp_output_dir, "depth_colormap.mp4")

	    fourcc = cv2.VideoWriter_fourcc(*'XVID')
	    out_detection = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
	    out_depth = cv2.VideoWriter(output_depth_path, fourcc, fps, (width, height))

	    frame_counter = 0
	    start_time = time.time()

	    try:
	        # Rough focal-length guess forwarded to the depth model's infer().
	        focal_length_px = torch.tensor([max(width, height)], device=device)

	        progress_bar = st.progress(0)
	        progress_text = st.empty()

	        col1, col2 = st.columns(2)
	        detection_placeholder = col1.empty()
	        depth_placeholder = col2.empty()

	        # Frames wider than the target are shrunk before inference to save time.
	        target_width = 640
	        scale_factor = target_width / width
	        target_height = max(int(height * scale_factor), 1)
	        downscale = scale_factor < 1
	        box_scale = scale_factor if downscale else 1.0

	        while cap.isOpened():
	            ret, frame = cap.read()
	            if not ret:
	                break

	            frame_counter += 1

	            if total_frames > 0:
	                # Clamp: st.progress rejects values above 100 and the metadata
	                # frame count may be lower than the real one.
	                progress_bar.progress(min(int(frame_counter / total_frames * 100), 100))

	            if frame_counter % 10 == 0:
	                elapsed_time = time.time() - start_time
	                frames_left = max(total_frames - frame_counter, 0)
	                est_time_left = (elapsed_time / frame_counter) * frames_left
	                progress_text.text(f"Đang xử lý khung hình {frame_counter}/{total_frames} - Thời gian còn lại: {est_time_left:.2f}s")

	            if downscale:
	                frame_resized = cv2.resize(frame, (target_width, target_height))
	            else:
	                frame_resized = frame

	            person_boxes = _detect_people(yolo_model, frame_resized, box_scale, confidence)

	            # Depth is estimated BEFORE anything is drawn, so overlays never
	            # leak into the depth model's input when frame_resized is frame.
	            rgb_frame = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
	            depth_input = transform(Image.fromarray(rgb_frame))
	            if device.type == 'cuda':
	                depth_input = depth_input.to(device)

	            with torch.no_grad():
	                predictions = depth_model.infer(depth_input, f_px=focal_length_px)
	            depth_np = predictions["depth"].squeeze().cpu().numpy()

	            # Always bring the depth map to the output frame size: the writer
	            # expects (width, height) and box centres are in frame coordinates.
	            if depth_np.shape[:2] != (height, width):
	                depth_np = cv2.resize(depth_np, (width, height), interpolation=cv2.INTER_LINEAR)

	            depth_colormap = _depth_to_colormap(depth_np)

	            for box in person_boxes:
	                x1, y1, x2, y2 = box
	                # Sample depth at the box centre, clamped to the map bounds.
	                center_x = min(max((x1 + x2) // 2, 0), depth_np.shape[1] - 1)
	                center_y = min(max((y1 + y2) // 2, 0), depth_np.shape[0] - 1)
	                _annotate_person(frame, box, depth_np[center_y, center_x])

	            # Refresh the preview only every 5th frame to keep the UI responsive.
	            if frame_counter % 5 == 0:
	                detection_placeholder.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), caption="Detect people", use_container_width=True)
	                # applyColorMap returns BGR; convert so Streamlit shows true colours.
	                depth_placeholder.image(cv2.cvtColor(depth_colormap, cv2.COLOR_BGR2RGB), caption="depth: ", use_container_width=True)

	            out_detection.write(frame)
	            out_depth.write(depth_colormap)

	    finally:
	        cap.release()
	        out_detection.release()
	        out_depth.release()

	    total_time = time.time() - start_time
	    average_fps = frame_counter / total_time if total_time > 0 else 0.0
	    st.success(f"Xử lý hoàn tất! Tổng thời gian: {total_time:.2f}s")
	    st.success(f"FPS trung bình: {average_fps:.2f}")

	    return output_video_path, output_depth_path


	# Main Streamlit interface
	def main():
	    """Render the upload UI, run ``process_video`` on demand and show the results.

	    The uploaded video is previewed directly from memory; a temporary copy on
	    disk is created only when processing starts and is always removed
	    afterwards, so reruns of the script do not accumulate temp files.
	    """
	    st.title("Ứng dụng Phát hiện Người và Ước tính Độ sâu")
	    st.write("Tải lên video để phát hiện người và hiển thị thông tin độ sâu")

	    uploaded_file = st.file_uploader("Chọn một tệp video", type=['mp4', 'avi', 'mov'])
	    if uploaded_file is not None:
	        st.video(uploaded_file)

	    st.sidebar.header("Thông tin mô hình")
	    st.sidebar.markdown(
	        "\n"
	        "- Phát hiện người: YOLOv8n\n"
	        "- Ước tính độ sâu: Depth_Pro\n"
	    )

	    # Detection threshold chosen by the user; forwarded to process_video.
	    confidence = st.sidebar.slider("Ngưỡng tin cậy", 0.0, 1.0, 0.5)

	    if uploaded_file is not None and st.button("Xử lý Video"):
	        # Keep the upload's own extension (falls back to .mp4 when it has none).
	        suffix = os.path.splitext(uploaded_file.name)[1] or '.mp4'
	        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
	            temp_file.write(uploaded_file.getvalue())
	            video_path = temp_file.name

	        try:
	            with st.spinner("Đang xử lý video..."):
	                detection_video_path, depth_video_path = process_video(video_path, confidence=confidence)

	            st.subheader("Video đã xử lý")

	            col1, col2 = st.columns(2)
	            with col1:
	                st.video(detection_video_path)
	                st.download_button(
	                    label="Tải xuống video phát hiện",
	                    data=_read_bytes(detection_video_path),
	                    file_name="person_detection_with_depth.mp4",
	                    mime="video/mp4"
	                )

	            with col2:
	                st.video(depth_video_path)
	                st.download_button(
	                    label="Tải xuống bản đồ độ sâu",
	                    data=_read_bytes(depth_video_path),
	                    file_name="depth_colormap.mp4",
	                    mime="video/mp4"
	                )
	        finally:
	            # Remove the temporary input copy even if processing failed.
	            os.unlink(video_path)

	# Tệp requirements.txt
	def create_requirements():
	    """Return the contents of ``requirements.txt`` for this app.

	    Returns:
	        A string that starts with a newline and lists one package per line,
	        each exactly once and without leading indentation.
	    """
	    packages = (
	        "streamlit",
	        "numpy",
	        "Pillow",
	        "opencv-python",
	        "torch",
	        "torchvision",
	        "transformers",
	        "ultralytics",
	        "requests",
	    )
	    return "\n" + "\n".join(packages) + "\n"

	# Launch the Streamlit UI only when this file is run as a script.
	if __name__ == "__main__":
	    main()