import os
from typing import List
import spaces
import gradio as gr
import numpy as np
import torch
import json
import tempfile
import torch.nn.functional as F
from torchvision import transforms
from PIL import Image
import cv2
from gradio.themes.utils import sizes
from classes_and_palettes import (
COCO_KPTS_COLORS,
COCO_WHOLEBODY_KPTS_COLORS,
GOLIATH_KPTS_COLORS,
GOLIATH_SKELETON_INFO,
GOLIATH_KEYPOINTS
)
import sys
import subprocess
import importlib.util
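# The mm* dependencies (mmengine, mmcv, mmdet) are vendored as pre-built wheels
# under ./external and are installed at startup if they are not already present.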
def is_package_installed(package_name):
return importlib.util.find_spec(package_name) is not None
def find_wheel(package_path):
dist_dir = os.path.join(package_path, "dist")
if os.path.exists(dist_dir):
wheel_files = [f for f in os.listdir(dist_dir) if f.endswith('.whl')]
if wheel_files:
return os.path.join(dist_dir, wheel_files[0])
return None
def install_from_wheel(package_name, package_path):
wheel_file = find_wheel(package_path)
if wheel_file:
print(f"Installing {package_name} from wheel: {wheel_file}")
subprocess.check_call([sys.executable, "-m", "pip", "install", wheel_file])
else:
print(f"{package_name} wheel not found in {package_path}. Please build it first.")
sys.exit(1)
def install_local_packages():
packages = [
("mmengine", "./external/engine"),
("mmcv", "./external/cv"),
("mmdet", "./external/det")
]
for package_name, package_path in packages:
if not is_package_installed(package_name):
print(f"Installing {package_name}...")
install_from_wheel(package_name, package_path)
else:
print(f"{package_name} is already installed.")
# Install the vendored packages at startup, before importing modules that depend on mmdet.
install_local_packages()
from detector_utils import (
adapt_mmdet_pipeline,
init_detector,
process_images_detector,
)
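# Paths to the Sapiens pose TorchScript checkpoints (0.3b / 0.6b / 1b) and the
# RTMDet person-detector config and weights bundled under assets/.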
class Config:
ASSETS_DIR = os.path.join(os.path.dirname(__file__), 'assets')
CHECKPOINTS_DIR = os.path.join(ASSETS_DIR, "checkpoints")
CHECKPOINTS = {
"0.3b": "sapiens_0.3b_goliath_best_goliath_AP_573_torchscript.pt2",
"0.6b": "sapiens_0.6b_goliath_best_goliath_AP_609_torchscript.pt2",
"1b": "sapiens_1b_goliath_best_goliath_AP_639_torchscript.pt2",
}
DETECTION_CHECKPOINT = os.path.join(CHECKPOINTS_DIR, 'rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth')
DETECTION_CONFIG = os.path.join(ASSETS_DIR, 'rtmdet_m_640-8xb32_coco-person_no_nms.py')
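# Loads a TorchScript pose checkpoint onto the GPU and runs it under inference mode.
# Note that the checkpoint is reloaded on each request rather than cached across calls.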
class ModelManager:
@staticmethod
def load_model(checkpoint_name: str):
if checkpoint_name is None:
return None
checkpoint_path = os.path.join(Config.CHECKPOINTS_DIR, checkpoint_name)
model = torch.jit.load(checkpoint_path)
model.eval()
model.to("cuda")
return model
@staticmethod
@torch.inference_mode()
def run_model(model, input_tensor):
return model(input_tensor)
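# End-to-end pipeline: detect persons with RTMDet (on CPU), crop each box,
# run the Sapiens pose model on GPU, decode the 308 keypoint heatmaps, and draw the results.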
class ImageProcessor:
def __init__(self):
self.transform = transforms.Compose([
transforms.Resize((1024, 768)),
transforms.ToTensor(),
transforms.Normalize(mean=[123.5/255, 116.5/255, 103.5/255],
std=[58.5/255, 57.0/255, 57.5/255])
])
self.detector = init_detector(
Config.DETECTION_CONFIG, Config.DETECTION_CHECKPOINT, device='cpu'
)
self.detector.cfg = adapt_mmdet_pipeline(self.detector.cfg)
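    # Run the person detector on the full image and keep boxes above the score threshold.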
def detect_persons(self, image: Image.Image):
# Convert PIL Image to tensor
image = np.array(image)
image = np.expand_dims(image, axis=0)
# Perform person detection
bboxes_batch = process_images_detector(
image,
self.detector
)
bboxes = self.get_person_bboxes(bboxes_batch[0]) # Get bboxes for the first (and only) image
return bboxes
def get_person_bboxes(self, bboxes_batch, score_thr=0.3):
person_bboxes = []
for bbox in bboxes_batch:
if len(bbox) == 5: # [x1, y1, x2, y2, score]
if bbox[4] > score_thr:
person_bboxes.append(bbox)
elif len(bbox) == 4: # [x1, y1, x2, y2]
person_bboxes.append(bbox + [1.0]) # Add a default score of 1.0
return person_bboxes
@spaces.GPU
@torch.inference_mode()
def estimate_pose(self, image: Image.Image, bboxes: List[List[float]], model_name: str, kpt_threshold: float):
pose_model = ModelManager.load_model(Config.CHECKPOINTS[model_name])
result_image = image.copy()
all_keypoints = [] # List to store keypoints for all persons
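        # Process each detected person independently: crop to its box, resize/normalize
        # to 1024x768, run the pose model, and decode keypoints back into image coordinates.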
for bbox in bboxes:
cropped_img = self.crop_image(result_image, bbox)
input_tensor = self.transform(cropped_img).unsqueeze(0).to("cuda")
heatmaps = ModelManager.run_model(pose_model, input_tensor)
keypoints = self.heatmaps_to_keypoints(heatmaps[0].cpu().numpy(), bbox)
all_keypoints.append(keypoints) # Collect keypoints
result_image = self.draw_keypoints(result_image, keypoints, bbox, kpt_threshold)
return result_image, all_keypoints
def process_image(self, image: Image.Image, model_name: str, kpt_threshold: str):
bboxes = self.detect_persons(image)
result_image, keypoints = self.estimate_pose(image, bboxes, model_name, float(kpt_threshold))
return result_image, keypoints
def crop_image(self, image, bbox):
if len(bbox) == 4:
x1, y1, x2, y2 = map(int, bbox)
elif len(bbox) >= 5:
x1, y1, x2, y2, _ = map(int, bbox[:5])
else:
raise ValueError(f"Unexpected bbox format: {bbox}")
crop = image.crop((x1, y1, x2, y2))
return crop
@staticmethod
def heatmaps_to_keypoints(heatmaps, bbox):
num_joints = heatmaps.shape[0] # Should be 308
keypoints = {}
x1, y1, x2, y2 = map(int, bbox[:4])
bbox_width = x2 - x1
bbox_height = y2 - y1
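        # Take the argmax of each of the 308 heatmaps (expected on a 256x192 grid,
        # i.e. 1/4 of the 1024x768 network input) and map it back into the bbox in
        # original-image coordinates; the peak value is used as the keypoint confidence.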
for i, name in enumerate(GOLIATH_KEYPOINTS):
if i < num_joints:
heatmap = heatmaps[i]
y, x = np.unravel_index(np.argmax(heatmap), heatmap.shape)
conf = heatmap[y, x]
# Convert coordinates to image frame
x_image = x * bbox_width / 192 + x1
y_image = y * bbox_height / 256 + y1
keypoints[name] = (float(x_image), float(y_image), float(conf))
return keypoints
@staticmethod
def draw_keypoints(image, keypoints, bbox, kpt_threshold):
image = np.array(image)
# Handle both 4 and 5-element bounding boxes
if len(bbox) == 4:
x1, y1, x2, y2 = map(int, bbox)
elif len(bbox) >= 5:
x1, y1, x2, y2, _ = map(int, bbox[:5])
else:
raise ValueError(f"Unexpected bbox format: {bbox}")
# Calculate adaptive radius and thickness based on bounding box size
bbox_width = x2 - x1
bbox_height = y2 - y1
bbox_size = np.sqrt(bbox_width * bbox_height)
radius = max(1, int(bbox_size * 0.006)) # minimum 1 pixel
thickness = max(1, int(bbox_size * 0.006)) # minimum 1 pixel
bbox_thickness = max(1, thickness//4)
cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), bbox_thickness)
# Draw keypoints
for i, (name, (x, y, conf)) in enumerate(keypoints.items()):
if conf > kpt_threshold and i < len(GOLIATH_KPTS_COLORS):
x_coord = int(x)
y_coord = int(y)
color = GOLIATH_KPTS_COLORS[i]
cv2.circle(image, (x_coord, y_coord), radius, color, -1)
# Draw skeleton
for _, link_info in GOLIATH_SKELETON_INFO.items():
pt1_name, pt2_name = link_info['link']
color = link_info['color']
if pt1_name in keypoints and pt2_name in keypoints:
pt1 = keypoints[pt1_name]
pt2 = keypoints[pt2_name]
if pt1[2] > kpt_threshold and pt2[2] > kpt_threshold:
x1_coord = int(pt1[0])
y1_coord = int(pt1[1])
x2_coord = int(pt2[0])
y2_coord = int(pt2[1])
cv2.line(image, (x1_coord, y1_coord), (x2_coord, y2_coord), color, thickness=thickness)
return Image.fromarray(image)
class GradioInterface:
def __init__(self):
self.image_processor = ImageProcessor()
def create_interface(self):
app_styles = """
<style>
/* Global Styles */
body, #root {
font-family: Helvetica, Arial, sans-serif;
background-color: #1a1a1a;
color: #fafafa;
}
/* Header Styles */
.app-header {
background: linear-gradient(45deg, #1a1a1a 0%, #333333 100%);
padding: 24px;
border-radius: 8px;
margin-bottom: 24px;
text-align: center;
}
.app-title {
font-size: 48px;
margin: 0;
color: #fafafa;
}
.app-subtitle {
font-size: 24px;
margin: 8px 0 16px;
color: #fafafa;
}
.app-description {
font-size: 16px;
line-height: 1.6;
opacity: 0.8;
margin-bottom: 24px;
}
/* Button Styles */
.publication-links {
display: flex;
justify-content: center;
flex-wrap: wrap;
gap: 8px;
margin-bottom: 16px;
}
.publication-link {
display: inline-flex;
align-items: center;
padding: 8px 16px;
background-color: #333;
color: #fff !important;
text-decoration: none !important;
border-radius: 20px;
font-size: 14px;
transition: background-color 0.3s;
}
.publication-link:hover {
background-color: #555;
}
.publication-link i {
margin-right: 8px;
}
/* Content Styles */
.content-container {
background-color: #2a2a2a;
border-radius: 8px;
padding: 24px;
margin-bottom: 24px;
}
/* Image Styles */
.image-preview img {
max-width: 512px;
max-height: 512px;
margin: 0 auto;
border-radius: 4px;
display: block;
object-fit: contain;
}
/* Control Styles */
.control-panel {
background-color: #333;
padding: 16px;
border-radius: 8px;
margin-top: 16px;
}
/* Gradio Component Overrides */
.gr-button {
background-color: #4a4a4a;
color: #fff;
border: none;
border-radius: 4px;
padding: 8px 16px;
cursor: pointer;
transition: background-color 0.3s;
}
.gr-button:hover {
background-color: #5a5a5a;
}
.gr-input, .gr-dropdown {
background-color: #3a3a3a;
color: #fff;
border: 1px solid #4a4a4a;
border-radius: 4px;
padding: 8px;
}
.gr-form {
background-color: transparent;
}
.gr-panel {
border: none;
background-color: transparent;
}
/* Override any conflicting styles from Bulma */
.button.is-normal.is-rounded.is-dark {
color: #fff !important;
text-decoration: none !important;
}
</style>
"""
header_html = f"""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css">
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
{app_styles}
<div class="app-header">
<h1 class="app-title">Sapiens: Pose Estimation</h1>
<h2 class="app-subtitle">ECCV 2024 (Oral)</h2>
<p class="app-description">
Meta presents Sapiens, foundation models for human tasks pretrained on 300 million human images.
This demo showcases the finetuned pose estimation model. <br>
</p>
<div class="publication-links">
<a href="https://arxiv.org/abs/2408.12569" class="publication-link">
<i class="fas fa-file-pdf"></i>arXiv
</a>
<a href="https://github.com/facebookresearch/sapiens" class="publication-link">
<i class="fab fa-github"></i>Code
</a>
<a href="https://about.meta.com/realitylabs/codecavatars/sapiens/" class="publication-link">
<i class="fas fa-globe"></i>Meta
</a>
<a href="https://rawalkhirodkar.github.io/sapiens" class="publication-link">
<i class="fas fa-chart-bar"></i>Results
</a>
</div>
<div class="publication-links">
<a href="https://huggingface.co/spaces/facebook/sapiens_pose" class="publication-link">
<i class="fas fa-user"></i>Demo-Pose
</a>
<a href="https://huggingface.co/spaces/facebook/sapiens_seg" class="publication-link">
<i class="fas fa-puzzle-piece"></i>Demo-Seg
</a>
<a href="https://huggingface.co/spaces/facebook/sapiens_depth" class="publication-link">
<i class="fas fa-cube"></i>Demo-Depth
</a>
<a href="https://huggingface.co/spaces/facebook/sapiens_normal" class="publication-link">
<i class="fas fa-vector-square"></i>Demo-Normal
</a>
</div>
</div>
"""
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
def process_image(image, model_name, kpt_threshold):
result_image, keypoints = self.image_processor.process_image(image, model_name, kpt_threshold)
with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w') as json_file:
json.dump(keypoints, json_file)
json_file_path = json_file.name
return result_image, json_file_path
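        # UI layout: input image, model size, and confidence threshold on the left;
        # annotated result image and downloadable keypoint JSON on the right.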
with gr.Blocks(js=js_func, theme=gr.themes.Default()) as demo:
gr.HTML(header_html)
with gr.Row(elem_classes="content-container"):
with gr.Column():
input_image = gr.Image(label="Input Image", type="pil", format="png", elem_classes="image-preview")
with gr.Row():
model_name = gr.Dropdown(
label="Model Size",
choices=list(Config.CHECKPOINTS.keys()),
value="1b",
)
kpt_threshold = gr.Dropdown(
label="Min Keypoint Confidence",
choices=["0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9"],
value="0.3",
)
example_model = gr.Examples(
inputs=input_image,
examples_per_page=14,
examples=[
os.path.join(Config.ASSETS_DIR, "images", img)
for img in os.listdir(os.path.join(Config.ASSETS_DIR, "images"))
],
)
with gr.Column():
result_image = gr.Image(label="Pose-308 Result", type="pil", elem_classes="image-preview")
json_output = gr.File(label="Pose-308 Output (.json)")
run_button = gr.Button("Run")
run_button.click(
fn=process_image,
inputs=[input_image, model_name, kpt_threshold],
outputs=[result_image, json_output],
)
return demo
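# Allow TF32 matmuls/convolutions for faster inference on Ampere and newer GPUs.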
def main():
if torch.cuda.is_available():
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
interface = GradioInterface()
demo = interface.create_interface()
demo.launch(share=False)
if __name__ == "__main__":
main()