Spaces:

James040
/

Pose-Extractor-Video-MP

Sleeping

App Files Files Community

Pose-Extractor-Video-MP / app.py

James040

Update app.py

549107e verified about 1 month ago

raw

history blame contribute delete

5.61 kB

	import cv2
	import numpy as np
	import gradio as gr
	import subprocess
	import urllib.request
	import os
	import json

	# 1. Modern Tasks API
	import mediapipe as mp
	from mediapipe.tasks import python
	from mediapipe.tasks.python import vision

	# Auto-download the pose model the first time the app starts.
	MODEL_PATH = "pose_landmarker_lite.task"
	MODEL_URL = "https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_lite/float16/1/pose_landmarker_lite.task"

	if not os.path.exists(MODEL_PATH):
	    print("Downloading MediaPipe Pose Model...")
	    # Fetch into a sibling ".part" file and rename it only after the transfer
	    # finished. Writing straight to MODEL_PATH would leave a truncated file
	    # behind on an interrupted download, and the existence check above would
	    # then accept that broken file on every later start.
	    _partial_model_path = MODEL_PATH + ".part"
	    try:
	        urllib.request.urlretrieve(MODEL_URL, _partial_model_path)
	        os.replace(_partial_model_path, MODEL_PATH)
	    finally:
	        # Only still present when the download or the rename failed.
	        if os.path.exists(_partial_model_path):
	            os.remove(_partial_model_path)

	# Pairs of landmark indices joined by a line when the stick figure is drawn.
	# NOTE(review): indices appear to follow MediaPipe's 33-point pose landmark
	# numbering (0-32); the body-part labels below are a reading aid - confirm
	# against the MediaPipe landmark chart before relying on them.
	POSE_CONNECTIONS = [
	    # head / face (landmarks 0-10)
	    (0, 1), (1, 2), (2, 3), (3, 7),
	    (0, 4), (4, 5), (5, 6), (6, 8),
	    (9, 10),
	    # shoulder line, then the arm/hand chain starting at landmark 11
	    (11, 12),
	    (11, 13), (13, 15), (15, 17), (15, 19), (15, 21), (17, 19),
	    # arm/hand chain starting at landmark 12
	    (12, 14), (14, 16), (16, 18), (16, 20), (16, 22), (18, 20),
	    # torso
	    (11, 23), (12, 24), (23, 24),
	    # legs and feet (both sides interleaved)
	    (23, 25), (24, 26), (25, 27), (26, 28),
	    (27, 29), (28, 30), (29, 31), (30, 32),
	    (27, 31), (28, 32),
	]

	# Frame rate assumed when the container reports none (0, negative or NaN).
	_FALLBACK_FPS = 30.0


	def _draw_skeleton(canvas, pose, width, height):
	    """Draw one pose as a stick figure onto *canvas* (modified in place)."""
	    # Normalised landmark coordinates -> pixel coordinates, computed once.
	    pixels = [(int(lm.x * width), int(lm.y * height)) for lm in pose]

	    for start_idx, end_idx in POSE_CONNECTIONS:
	        cv2.line(canvas, pixels[start_idx], pixels[end_idx], (0, 255, 0), 10)

	    # Joints are drawn after the bones so the dots sit on top of the lines.
	    for px in pixels:
	        cv2.circle(canvas, px, 15, (255, 255, 255), -1)


	def _mux_audio(silent_video, source_video, merged_video):
	    """Combine the rendered video with the source's audio track using ffmpeg.

	    Returns *merged_video* on success. If ffmpeg is missing or exits with an
	    error, the problem is printed and *silent_video* is returned instead.
	    """
	    command = [
	        "ffmpeg", "-y", "-i", silent_video, "-i", source_video,
	        "-c:v", "copy", "-c:a", "aac", "-map", "0:v:0", "-map", "1:a:0?",
	        "-shortest", merged_video
	    ]
	    try:
	        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	    except (subprocess.SubprocessError, OSError) as e:
	        # OSError covers a missing/non-executable ffmpeg binary;
	        # SubprocessError covers a non-zero exit status.
	        print("FFmpeg error:", e)
	        return silent_video
	    return merged_video


	def extract_pose_and_data(video_path):
	    """Render a stick-figure video and export per-frame 3D pose landmarks.

	    Every frame of the input is run through the MediaPipe pose landmarker.
	    The first detected pose is drawn on a black canvas (one canvas per input
	    frame) and its world landmarks are collected for the JSON export. Frames
	    without a detection still produce a black video frame and a JSON entry
	    with an empty ``landmarks`` list.

	    Args:
	        video_path: Path of the video to process, or ``None``.

	    Returns:
	        A 3-tuple ``(video_file, json_file, frames)``:
	        path of the rendered video (with the source audio when ffmpeg
	        succeeded, otherwise the silent render), path of the written JSON
	        file, and the list of per-frame dictionaries that was written to it.
	        ``(None, None, None)`` when *video_path* is ``None`` or the video
	        cannot be opened.
	    """
	    if video_path is None:
	        return None, None, None

	    output_video_path = "final_output.mp4"
	    temp_video = "temp_silent.mp4"
	    output_json_path = "pose_data.json"

	    cap = cv2.VideoCapture(video_path)
	    if not cap.isOpened():
	        # Unreadable/unsupported file: nothing to process.
	        cap.release()
	        return None, None, None

	    out = None
	    # Storage for Blender Data
	    all_frames_data = []

	    try:
	        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
	        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

	        fps = cap.get(cv2.CAP_PROP_FPS)
	        # "not fps > 0" is also true for NaN. Without this guard the
	        # timestamp computation below divides by zero.
	        if not fps > 0:
	            fps = _FALLBACK_FPS

	        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
	        out = cv2.VideoWriter(temp_video, fourcc, fps, (width, height))

	        options = vision.PoseLandmarkerOptions(
	            base_options=python.BaseOptions(model_asset_path=MODEL_PATH),
	            running_mode=vision.RunningMode.VIDEO
	        )

	        with vision.PoseLandmarker.create_from_options(options) as landmarker:
	            frame_idx = 0
	            while cap.isOpened():
	                ret, frame = cap.read()
	                if not ret:
	                    break

	                # OpenCV delivers BGR; the mp.Image below is declared as SRGB.
	                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
	                timestamp_ms = int((frame_idx / fps) * 1000)

	                result = landmarker.detect_for_video(mp_image, timestamp_ms)
	                canvas = np.zeros((height, width, 3), dtype=np.uint8)

	                frame_entry = {
	                    "frame": frame_idx,
	                    "timestamp_ms": timestamp_ms,
	                    "landmarks": []
	                }

	                if result.pose_landmarks and result.pose_world_landmarks:
	                    # 1. Extract 3D World Data for JSON (For Blender)
	                    frame_entry["landmarks"] = [
	                        {
	                            "x": landmark.x,
	                            "y": landmark.y,
	                            "z": landmark.z,
	                            "visibility": landmark.visibility
	                        }
	                        for landmark in result.pose_world_landmarks[0]
	                    ]

	                    # 2. Draw 2D Data for Video (For EbSynth)
	                    _draw_skeleton(canvas, result.pose_landmarks[0], width, height)

	                all_frames_data.append(frame_entry)
	                out.write(canvas)
	                frame_idx += 1
	    finally:
	        # Release the capture and finalise the output file even when
	        # detection raised part-way through the video.
	        cap.release()
	        if out is not None:
	            out.release()

	    # Save the JSON file
	    with open(output_json_path, 'w') as f:
	        json.dump(all_frames_data, f, indent=4)

	    # Merge Audio Native FFmpeg (falls back to the silent render on failure)
	    output_video_path = _mux_audio(temp_video, video_path, output_video_path)

	    # Return: Video File, JSON File (for download), JSON Dictionary (for UI Copying)
	    return output_video_path, output_json_path, all_frames_data

	# ---------------------------------------------------------------------------
	# Gradio front end: upload on the left, results on the right, raw JSON below.
	# ---------------------------------------------------------------------------
	with gr.Blocks(title="Pose & 3D Data Extractor") as interface:
	    # Page heading and a short description of what the tool produces.
	    gr.Markdown("# 🕺 Pose Video & 3D JSON Extractor")
	    gr.Markdown("Generates a thick stickman for EbSynth and extracts `pose_world_landmarks` (x, y, z) for Blender IK.")

	    with gr.Row():
	        # Left column: input clip and the button that starts processing.
	        with gr.Column():
	            video_input = gr.Video(label="Upload Dancing Clip (15-30s)")
	            submit_btn = gr.Button("Extract Pose & Data", variant="primary")

	        # Right column: rendered stick-figure video and the JSON download.
	        with gr.Column():
	            video_output = gr.Video(label="Meaty Stickman Output")
	            file_output = gr.File(label="Download 3D JSON Data")

	    with gr.Row():
	        # Per the original author's note, gr.JSON shows a "Copy" button in
	        # its top-right corner, which the label points the user to.
	        json_output = gr.JSON(label="Raw JSON Data (Click top right to Copy)")

	    # The three return values of extract_pose_and_data map, in order, onto
	    # the video player, the file download and the JSON viewer.
	    submit_btn.click(
	        extract_pose_and_data,
	        inputs=video_input,
	        outputs=[video_output, file_output, json_output],
	    )

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    interface.launch()