Spaces:

amaai-lab
/

SonicVerse

Runtime error

App Files Files Community

SonicVerse / src /sonicverse /scripts /xclip_build_pretrain_dataset.py

annabeth97c

Initial commit

12f2e48 verified 2 days ago

raw

history blame contribute delete

2.59 kB

	from typing import List
	import random
	import argparse
	import json

	from huggingface_hub import hf_hub_download
	from datasets import Dataset

	from multi_token.constants import ROLE_ASSISTANT, ROLE_USER

	# User-side prompt templates for the pretraining conversations.
	# `_write_convo` picks one entry uniformly at random (via `random.choice`)
	# as the user message of each example. Every template contains the literal
	# "<video>" marker -- presumably a placeholder that downstream code replaces
	# with the video input; confirm against the consumer of this dataset.
	# NOTE(review): "What is the video about? <video>" is listed twice, so it is
	# sampled twice as often as the other templates, and "occuring" is
	# misspelled. Both are left untouched so that generated data does not change.
	PRETRAIN_PHRASES = [
	"Repeat the content of the video <video>",
	"What is occuring in the video? <video>",
	"<video>. What happened?",
	"Convert <video> to text",
	"What is being depicted in <video>?",
	"What is the content of <video>?",
	"Describe what occurs in the video. <video>",
	"What is the video about? <video>",
	"<video>. Tell me what occurs in the video.",
	"What is the video about? <video>",
	"Give me a summary of <video>",
	"<video>. Detail what is happening in the video.",
	"Tell me about <video>",
	]


	def _timestamp_to_seconds(timestamp: str):
	parts = timestamp.split(":")
	seconds = float(parts[-1])
	seconds += float(parts[-2]) * 60
	seconds += float(parts[-3]) * 60 * 60
	return seconds


	def _write_convo(row) -> dict:
	    """Build one pretraining example from a caption row.

	    Args:
	        row: A mapping providing the keys ``"YoutubeID"``,
	            ``"Start_timestamp"``, ``"End_timestamp"`` and ``"Caption"``.

	    Returns:
	        A dict with two keys: ``"videos"``, a one-element list describing
	        the clip (YouTube URL plus start/end time in seconds), and
	        ``"messages"``, a user prompt drawn at random from
	        ``PRETRAIN_PHRASES`` followed by the assistant reply holding the
	        row's caption.
	    """
	    video = {
	        "url": "https://www.youtube.com/watch?v=" + row["YoutubeID"],
	        "start_time": _timestamp_to_seconds(row["Start_timestamp"]),
	        "end_time": _timestamp_to_seconds(row["End_timestamp"]),
	    }
	    # The clip is intentionally not loaded or validated here; presumably an
	    # unavailable video is meant to fail later, when it is actually loaded.
	    return {
	        "videos": [video],
	        "messages": [
	            {"role": ROLE_USER, "content": random.choice(PRETRAIN_PHRASES)},
	            {"role": ROLE_ASSISTANT, "content": row["Caption"]},
	        ],
	    }


	def main(args):
	    """Download the InternVid captions and save them as a chat-style dataset.

	    Fetches ``caption.jsonl`` from the ``OpenGVLab/InternVid`` dataset repo,
	    parses one JSON document per line, randomly down-samples to at most
	    ``args.max_examples`` rows, converts each row with ``_write_convo`` and
	    saves the resulting ``Dataset`` to ``args.output_folder``.

	    Args:
	        args: Parsed command-line arguments providing ``max_examples``
	            (int) and ``output_folder`` (str).
	    """
	    path = hf_hub_download(
	        repo_id="OpenGVLab/InternVid", filename="caption.jsonl", repo_type="dataset"
	    )

	    # Read as UTF-8 explicitly so parsing does not depend on the platform's
	    # default locale encoding, and skip blank lines, which are not valid JSON.
	    with open(path, "r", encoding="utf-8") as f:
	        rows = [json.loads(line) for line in f if line.strip()]
	    print("Dataset size:", len(rows))

	    if len(rows) > args.max_examples:
	        rows = random.sample(rows, k=args.max_examples)

	    def gen(subset_rows):
	        for row in subset_rows:
	            try:
	                yield _write_convo(row)
	            except Exception as e:
	                # Best-effort conversion: report the failing row's error and
	                # carry on rather than aborting the whole build.
	                print(e)

	    ds = Dataset.from_generator(gen, gen_kwargs={"subset_rows": rows}, num_proc=5)
	    ds.save_to_disk(args.output_folder)


	if __name__ == "__main__":
	    # Script entry point: read the output location and the example cap from
	    # the command line, then run the dataset build.
	    cli = argparse.ArgumentParser()
	    cli.add_argument(
	        "-o",
	        "--output_folder",
	        type=str,
	        default="/data/xclip-internvid-pretrain",
	    )
	    cli.add_argument(
	        "-n",
	        "--max_examples",
	        type=int,
	        default=500_000,
	    )
	    main(cli.parse_args())