| import imageio, os, torch, warnings, torchvision, argparse, json |
| from peft import LoraConfig, inject_adapter_in_model |
| from PIL import Image |
| import pandas as pd |
| from tqdm import tqdm |
| from accelerate import Accelerator |
| from accelerate.utils import DistributedDataParallelKwargs |
| import random |
| from decord import VideoReader |
| from decord import cpu, gpu |
| import imageio.v3 as iio |
|
|
| from torchvision import transforms |
| import torchvision |
| import random |
| import decord |
| from torchvision import transforms |
| import re |
| decord.bridge.set_bridge('torch') |
| import random |
| import numpy as np |
| from PIL import Image, ImageOps |
|
|
class MulltiShot_MultiView_Dataset(torch.utils.data.Dataset):
    """Multi-shot / multi-view face video dataset.

    Reads a metadata JSON mapping video ids to face-detection contexts,
    keeps only .mp4 clips that contain at least one group of >= 3 confident,
    large-enough face crops with sufficiently different head poses, and
    splits the result ~95/5 into train/test (seeded, reproducible).
    `__getitem__` returns a dict with a fixed-length frame sequence plus
    `ref_num` cropped reference face images.
    """

    def __init__(self, dataset_base_path='/root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view/datasets/merged_mark_paishe_ds_meg_merge_dwposefilter_paishe.json',
                 ref_image_path='/root/paddlejob/workspace/qizipeng/code/longvideogen/output.json',
                 time_division_factor=4,
                 time_division_remainder=1,
                 max_pixels=1920*1080,
                 height_division_factor=16, width_division_factor=16,
                 transform=None,
                 length=None,
                 resolution=None,
                 prev_length=5,
                 ref_num = 3,
                 training = True):
        """
        Args:
            dataset_base_path: path to the metadata JSON ({video_id: context}).
            ref_image_path: accepted for interface compatibility; not used here.
            time_division_factor / time_division_remainder: temporal grid
                parameters (stored, not used in this file).
            max_pixels: cap on H*W when resolution is dynamic.
            height_division_factor / width_division_factor: output sizes are
                floored to multiples of these.
            transform: accepted for interface compatibility; not used here.
            length: number of frames per sample (stored as ``num_frames``).
            resolution: (height, width) tuple, or None for dynamic resolution.
            prev_length: stored, not used in this file.
            ref_num: number of reference face crops returned per sample.
            training: select the train split (True) or the test split (False).

        Raises:
            ValueError: if `resolution` sets exactly one of height/width.
        """
        self.data_path = dataset_base_path
        self.data = []
        self.length = length
        self.resolution = resolution
        # `resolution` may legitimately be None (dynamic resolution mode);
        # the previous unconditional unpack raised TypeError in that case.
        if resolution is not None:
            self.height, self.width = resolution
        else:
            self.height, self.width = None, None
        self.num_frames = length
        self.time_division_factor = time_division_factor
        self.time_division_remainder = time_division_remainder
        self.max_pixels = max_pixels
        self.height_division_factor = height_division_factor
        self.width_division_factor = width_division_factor
        self.prev_length = prev_length
        self.training = training
        self.ref_num = ref_num

        with open(self.data_path, 'r') as f:
            meta_datas = json.load(f)

        for video_path in tqdm(meta_datas.keys()):
            context = meta_datas[video_path]

            disk_path = context["disk_path"]
            if not disk_path.lower().endswith(".mp4"):
                continue

            total_frame = None
            ref_id = self.get_ref_id(face_crop_angle=context['facedetect_v1'],
                                     facedetect_v1_frame_index=context['facedetect_v1_frame_index'],
                                     total_frame=total_frame)
            if not ref_id:
                continue

            ref_id_all = []
            for ids in ref_id:
                ref_id_group = []
                for face_idx in ids:
                    detect = context['facedetect_v1'][face_idx][0]['detect']
                    # Keep only confident detections.
                    if detect["prob"] < 0.99:
                        continue
                    top, height, width, left = detect['top'], detect['height'], detect['width'], detect['left']
                    # Keep only crops larger than 80px on the short side
                    # (written via `not >` so NaN-valued sizes are rejected too).
                    if not (min(height, width) > 80):
                        continue
                    width = int(width)
                    height = int(height)
                    frame_index = context['facedetect_v1_frame_index'][face_idx]
                    ref_id_group.append([top, height, width, left, int(frame_index)])
                # A usable group needs at least 3 surviving crops.
                if len(ref_id_group) >= 3:
                    ref_id_all.append(ref_id_group)
            if not ref_id_all:
                continue

            meta_prompt = {
                "global_caption": None,
                "per_shot_prompt": [],
                "single_prompt": context['text'],
            }
            self.data.append({'video_path': disk_path, 'meta_prompt': meta_prompt, "ref_id_all": ref_id_all})

        # NOTE: seeds the *global* random module so the split is reproducible
        # across runs; this also resets any RNG state set by the caller.
        random.seed(42)
        total = len(self.data)
        test_count = max(1, int(total * 0.05))
        test_indices = set(random.sample(range(total), test_count))

        self.data_test = [self.data[i] for i in range(total) if i in test_indices]
        self.data_train = [self.data[i] for i in range(total) if i not in test_indices]
        print(f"🔥 数据集划分完成:Train={len(self.data_train)}, Test={len(self.data_test)}")

        if self.height is not None and self.width is not None:
            print("Height and width are fixed. Setting `dynamic_resolution` to False.")
            self.dynamic_resolution = False
        elif self.height is None and self.width is None:
            print("Height and width are none. Setting `dynamic_resolution` to True.")
            self.dynamic_resolution = True
        else:
            # Previously this case fell through silently, leaving
            # `dynamic_resolution` unset and crashing later with AttributeError.
            raise ValueError("`resolution` must specify both height and width, or be None.")

    def get_ref_id(self, face_crop_angle, facedetect_v1_frame_index = None, total_frame = None, angle_threshold=50):
        """Return up to 3 index triplets [i, j, k] of face detections.

        Requirements:
          - face_crop_angle[i] / [j] / [k] must all be non-empty;
          - i and j differ by more than `angle_threshold` degrees in at
            least one of yaw / pitch / roll;
          - k is any other non-empty index (k != i and k != j).

        `facedetect_v1_frame_index` and `total_frame` are accepted for
        interface compatibility but not used here.
        """
        ref_id = []
        max_try = 5      # stop after scanning this many candidate (i, j) pairs
        need_max = 3     # return at most this many triplets
        try_num = 0

        # Indices whose detection list is non-empty.
        valid_indices = [idx for idx, item in enumerate(face_crop_angle) if item]
        N = len(valid_indices)

        if N < 3:
            return ref_id

        for a in range(N - 1):
            i = valid_indices[a]
            angle_i = face_crop_angle[i][0]["angle"]

            for b in range(a + 1, N):
                j = valid_indices[b]
                angle_j = face_crop_angle[j][0]["angle"]

                if (
                    abs(angle_i["yaw"] - angle_j["yaw"]) > angle_threshold or
                    abs(angle_i["pitch"] - angle_j["pitch"]) > angle_threshold or
                    abs(angle_i["roll"] - angle_j["roll"]) > angle_threshold
                ):
                    # Take the first remaining valid index as the third view.
                    for c in range(N):
                        k = valid_indices[c]
                        if k != i and k != j:
                            ref_id.append([i, j, k])
                            break

                try_num += 1
                if try_num >= max_try or len(ref_id) >= need_max:
                    return ref_id

        return ref_id

    def crop_and_resize(self, image, target_height, target_width):
        """Scale `image` up to cover (target_height, target_width), then center-crop."""
        width, height = image.size
        scale = max(target_width / width, target_height / height)
        image = torchvision.transforms.functional.resize(
            image,
            (round(height*scale), round(width*scale)),
            interpolation=torchvision.transforms.InterpolationMode.BILINEAR
        )
        image = torchvision.transforms.functional.center_crop(image, (target_height, target_width))
        return image

    def get_height_width(self, image):
        """Return the (height, width) a frame should be resized to.

        Dynamic mode: keep the image's own size, downscaled so that
        H*W <= max_pixels, then floored to the division factors.
        Fixed mode: return the configured (height, width).
        """
        if self.dynamic_resolution:
            width, height = image.size
            if width * height > self.max_pixels:
                scale = (width * height / self.max_pixels) ** 0.5
                height, width = int(height / scale), int(width / scale)
            height = height // self.height_division_factor * self.height_division_factor
            width = width // self.width_division_factor * self.width_division_factor
        else:
            height, width = self.height, self.width
        return height, width

    def resize_ref(self, img, target_h, target_w):
        """Letterbox `img` into (target_h, target_w): aspect-preserving resize
        plus symmetric white padding."""
        h = target_h
        w = target_w
        img = img.convert("RGB")

        img_ratio = img.width / img.height
        target_ratio = w / h

        if img_ratio > target_ratio:
            # Wider than the target: fit width, pad top/bottom.
            new_width = w
            new_height = int(new_width / img_ratio)
        else:
            # Taller than the target: fit height, pad left/right.
            new_height = h
            new_width = int(new_height * img_ratio)

        img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

        delta_w = w - img.size[0]
        delta_h = h - img.size[1]
        padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
        new_img = ImageOps.expand(img, padding, fill=(255, 255, 255))

        return new_img

    def load_video_crop_ref_image(self, video_path=None, ref_id_all=None):
        """Sample a 5-second, 16-fps clip (81 frames) plus reference face crops.

        Args:
            video_path: path to an .mp4 file readable by imageio.
            ref_id_all: list of groups of [top, height, width, left, frame_index]
                face boxes (defaults to one empty group).

        Returns:
            (frames, ref_images): list of PIL frames resized/cropped to the
            target resolution, and `ref_num` letterboxed face crops.

        Raises:
            ValueError: if the chosen group has fewer than `ref_num` crops.
        """
        # Avoid the mutable-default-argument pitfall of the original `=[[]]`.
        if ref_id_all is None:
            ref_id_all = [[]]

        reader = imageio.get_reader(video_path)
        try:
            meta = reader.get_meta_data()
            original_fps = meta.get("fps", 24)
            target_fps = 16
            duration_seconds = 5
            target_frames = target_fps * duration_seconds + 1  # 81 frames

            # Some containers cannot count frames; fall back to duration * fps.
            try:
                total_original_frames = reader.count_frames()
            except Exception:
                total_original_frames = int(meta.get("duration", 5) * original_fps)

            need_orig_frames = int(original_fps * duration_seconds)

            # Pick a random 5-second window (or the whole clip if shorter).
            if total_original_frames > need_orig_frames:
                max_start = total_original_frames - need_orig_frames
                start_frame = random.randint(0, max_start)
                segment_start = start_frame
                segment_end = start_frame + need_orig_frames
            else:
                segment_start = 0
                segment_end = total_original_frames

            # Uniformly sample `target_frames` indices within the window.
            sample_ids = np.linspace(segment_start, segment_end - 1, num=target_frames, dtype=int)

            frames = []
            for frame_id in sample_ids:
                frame = reader.get_data(int(frame_id))
                frame = Image.fromarray(frame)
                frame = self.crop_and_resize(frame, *self.get_height_width(frame))
                frames.append(frame)

            ref_group = random.choice(ref_id_all)

            if len(ref_group) < self.ref_num:
                raise ValueError(f"需要 {self.ref_num} 张参考图,但该组只有 {len(ref_group)} 张。")

            # Random subset, shuffled so reference order carries no signal.
            selected_refs = random.sample(ref_group, self.ref_num)
            random.shuffle(selected_refs)

            ref_images = []
            for top, height, width, left, frame_index in selected_refs:
                # Frames are 0-based, so index == total is already out of range
                # (the original used `>` here). get_data may still raise; the
                # caller's retry loop handles that.
                if frame_index >= total_original_frames:
                    print(f"{video_path}, frame_index({frame_index}) out of range")
                frame = Image.fromarray(reader.get_data(int(frame_index)))
                xmin, ymin, xmax, ymax = left, top, left + width, top + height
                cropped_image = frame.crop((xmin, ymin, xmax, ymax)).convert("RGB")
                cropped_image = self.resize_ref(cropped_image, self.height, self.width)
                ref_images.append(cropped_image)
        finally:
            # Always release the reader, even when decoding fails mid-way
            # (the original leaked it on any exception).
            reader.close()

        return frames, ref_images

    def __getitem__(self, index):
        """Return one sample dict, retrying broken videos with fresh indices.

        Raises:
            RuntimeError: if no sample could be loaded after `max_retry` tries.
        """
        max_retry = 10
        retry = 0

        while retry < max_retry:
            if self.training:
                meta_data = self.data_train[index % len(self.data_train)]
            else:
                meta_data = self.data_test[index % len(self.data_test)]

            video_path = meta_data['video_path']
            meta_prompt = meta_data['meta_prompt']
            ref_id_all = meta_data['ref_id_all']

            try:
                input_video, ref_images = self.load_video_crop_ref_image(
                    video_path=video_path,
                    ref_id_all=ref_id_all
                )
            except Exception as e:
                print("❌ Exception in load_video_crop_ref_image")
                print(f" video_path: {video_path}")
                print(f" error type: {type(e).__name__}")
                print(f" error msg : {e}")

                import traceback
                traceback.print_exc()
                input_video = None
                ref_images = None

            if input_video is not None and len(input_video) > 0:
                return {
                    "global_caption": None,
                    "shot_num": 1,
                    "pre_shot_caption": [],
                    "single_caption": meta_prompt["single_prompt"],
                    "video": input_video,
                    "ref_num": self.ref_num,
                    "ref_images": ref_images,
                    "video_path": video_path
                }

            # Broken sample: retry with a different random index.
            retry += 1
            index = random.randint(0, len(self.data_train) - 1 if self.training else len(self.data_test) - 1)

        raise RuntimeError(f"❌ [Dataset] Failed to load video/ref after {max_retry} retries.")

    def __len__(self):
        """Size of the active split (train when `training`, else test)."""
        if self.training:
            return len(self.data_train)
        else:
            return len(self.data_test)
|
|
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    # Quick smoke test: build the train split with a fixed clip length and
    # resolution, fetch one sample, then walk the whole dataset once.
    ds = MulltiShot_MultiView_Dataset(length=49, resolution=(384, 640), training=True)
    print(len(ds))
    sample = ds[0]

    for sample_idx in tqdm(range(len(ds))):
        _ = ds[sample_idx]

    # Deliberate debug stop after the full pass.
    assert 0