innodatalabs
/

fed-vitpose

Model card Files Files and versions

fed-vitpose / inference.py

MatthewTL's picture

Add ViTPoser intference API

f9f18ae 5 months ago

history blame contribute delete

1.51 kB

	from transformers import (
	VitPoseForPoseEstimation,
	AutoProcessor,
	RTDetrForObjectDetection,
	)
	from PIL import Image
	import torch

	# load models
	det_proc = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
	det_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365").eval()

	pose_proc = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
	pose_model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple").eval()

	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	det_model.to(device)
	pose_model.to(device)

	# Hugging Face will call this function
	def predict(inputs: dict) -> dict:
	"""
	inputs: {"image": PIL.Image}
	returns: {"poses": ...}
	"""
	image = inputs["image"]

	# detect people
	det_inputs = det_proc(images=image, return_tensors="pt").to(device)
	det_outputs = det_model(**det_inputs)
	results = det_proc.post_process_object_detection(
	det_outputs,
	threshold=0.5,
	target_sizes=[(image.height, image.width)]
	)
	# keep only "person" class (label 0)
	person_boxes = results[0]["boxes"][results[0]["labels"] == 0]

	# run pose estimation
	pose_inputs = pose_proc(image, boxes=[person_boxes], return_tensors="pt").to(device)
	with torch.no_grad():
	pose_outputs = pose_model(**pose_inputs)
	poses = pose_proc.post_process_pose_estimation(pose_outputs, boxes=[person_boxes])

	return {"poses": poses[0]}