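"""Generate QA pairs from long-video clip captions with a Qwen2.5-VL model served by vLLM.

The script loads MiraData-style clip captions (dense, background, and main-object),
formats each clip into a chat prompt, and runs generation with data parallelism:
one worker process per DP rank, each owning `GPUs_per_dp_rank` GPUs. Results are
appended to one JSONL file per rank under `--save_dir`, and already-processed
clips are skipped on restart.
"""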
import argparse
import gc
import json
import os
import pdb
import random
import time
from multiprocessing import Process

import jsonlines
import psutil
import torch
from tqdm import tqdm
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.utils import get_open_port

from prompt import object_recognition_prompt_miradata, prompt_miradata_based_text

random.seed(42)

# Pool of prompt-building templates; one is sampled per clip.
prompt_generate = [prompt_miradata_based_text]


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--save_dir", type=str, default="/share/minghao/VideoProjects/Sythesis/LongVideoCaption/CaptionResults")
    parser.add_argument("--model", type=str, default="Qwen2.5-VL-72B-Instruct-AWQ")
    parser.add_argument("--GPUs_per_dp_rank", type=int, default=2)
    parser.add_argument("--DP_size", type=int, default=4)
    parser.add_argument("--start", type=int, default=0)
    parser.add_argument("--end", type=int, default=10000)
    parser.add_argument("--max_num_seqs", type=int, default=4)
    parser.add_argument("--max_model_len", type=int, default=32768)
    parser.add_argument("--max_tokens", type=int, default=8192)
    args = parser.parse_args()
    return args


def get_have_processed(save_dir):
    """Collect the clip_ids that already appear in the result files under save_dir."""
    names = os.listdir(save_dir)
    paths = [os.path.join(save_dir, name) for name in names]
    record_video_id = set()
    for path in paths:
        datas = load_jsonl(path)
        for data in datas:
            record_video_id.add(data['clip_id'])
    return record_video_id


def load_jsonl(path):
    datas = []
    with jsonlines.open(path, "r") as reader:
        for obj in reader:
            datas.append(obj)
    return datas


def load_json(path):
    with open(path, "r") as reader:
        datas = json.load(reader)
    return datas


def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank, data_inps):
    # Data-parallel bookkeeping expected by vLLM: each worker process gets its
    # own DP rank and a dedicated slice of GPUs.
    os.environ["VLLM_DP_RANK"] = str(dp_rank)
    os.environ["VLLM_DP_SIZE"] = str(dp_size)
    os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip
    os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port)
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
        str(i) for i in range(dp_rank * GPUs_per_dp_rank, (dp_rank + 1) * GPUs_per_dp_rank))

    # Each DP rank processes a different contiguous slice of the inputs.
    prompts_per_rank = len(data_inps) // dp_size
    start = dp_rank * prompts_per_rank
    end = start + prompts_per_rank
    this_data_inps = data_inps[start:end]
    if len(this_data_inps) == 0:
        # Every rank must still issue a generate call, so give empty ranks a placeholder item.
        this_data_inps = [{"qa_prompt": "Placeholder"}]
    print(f"DP rank {dp_rank} needs to process {len(this_data_inps)} prompts")

    # NOTE: `args` is read from the parent process's globals (inherited via fork).
    max_tokens = args.max_tokens
    sampling_params = SamplingParams(temperature=0.1, top_k=20, top_p=0.8, repetition_penalty=1.05, max_tokens=max_tokens)

    model_name = f"/share/minghao/Models/{args.model}"
    max_model_len = args.max_model_len
    max_num_seqs = args.max_num_seqs

    llm = LLM(model=model_name,
              tensor_parallel_size=GPUs_per_dp_rank,
              max_model_len=max_model_len,
              enforce_eager=True,
              gpu_memory_utilization=0.9,
              max_num_seqs=max_num_seqs)

    batch_size = 2000
    save_dir = args.save_dir
    os.makedirs(save_dir, exist_ok=True)
    save_name = f'{dp_rank}.jsonl'
    save_path = os.path.join(save_dir, save_name)
    with open(save_path, 'a') as file:
        for i in range(0, len(this_data_inps), batch_size):
            start = time.time()
            batch_this_data_inps = this_data_inps[i:i + batch_size]
            batch_prompts = [tmp['qa_prompt'] for tmp in batch_this_data_inps]
            outputs = llm.generate(batch_prompts, sampling_params)

            print(f'Inference finished. Total outputs: {len(outputs)}')
            for idx, output in enumerate(outputs):
                this_inp = batch_this_data_inps[idx]
                prompt = output.prompt
                generated_qa = output.outputs[0].text
                this_inp['qa_prompt'] = prompt
                this_inp.update({"generated_qa": generated_qa})
                file.write(json.dumps(this_inp) + "\n")
                file.flush()

            end = time.time()
            del batch_this_data_inps, batch_prompts, outputs
            gc.collect()
            print(f'batch time cost: {end - start}s')
            print(f"[Memory] CPU: {psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2:.2f} MB")
            print(f"[Memory] GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")


def read_all_captions(root_caption_dir, caption_file_names):
    caption_file_dir_list = [os.path.join(root_caption_dir, file_name) for file_name in caption_file_names]
    datas = []
    for caption_dir in caption_file_dir_list:
        file_names = sorted(os.listdir(caption_dir))
        file_paths = [os.path.join(caption_dir, name) for name in file_names]
        for path in file_paths:
            datas += load_jsonl(path)
    return datas


if __name__ == "__main__":
    args = get_args()
    datas = load_json('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_31k_5_10min_filter_clips.json')
    print(f'Total Video Size: {len(datas)}')

    # Flatten videos into clips and give each clip a globally unique id.
    new_datas = []
    for data in tqdm(datas):
        clips = data['clips']
        for clip in clips:
            clip['clip_id'] = str(clip['clip_id']) + '_' + clip['video_id']
        new_datas.extend(clips)
    print(f'Total Clips Size: {len(new_datas)}')
    datas = new_datas

    start = args.start
    end = args.end
    datas = datas[start:end]
    print(f'Start: {start}, End: {end}')
    print(f'to process size: {len(datas)}')

    # Skip clips whose results already exist under save_dir.
    save_dir = args.save_dir
    if os.path.exists(save_dir):
        have_processed = get_have_processed(save_dir)
        filter_datas = []
        for data in tqdm(datas, desc='Filtering processed clips...'):
            if data['clip_id'] in have_processed:
                continue
            filter_datas.append(data)
        datas = filter_datas
        print(f'have_processed size : {len(have_processed)}')
        print(f'rest to process size : {len(datas)}')

    model_name = f"/share/minghao/Models/{args.model}"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Build a chat-formatted prompt for every clip from its captions.
    prompts = []
    data_inps = []
    for data in datas:
        this_prompt = random.choice(prompt_generate)
        dense_caption = data['dense_caption']
        background_caption = data['background_caption']
        main_object_caption = data['main_object_caption']
        system_prompt, user_prompt = this_prompt(dense_caption, background_caption, main_object_caption)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
        data['qa_prompt'] = prompt
        data_inps.append(data)

    print(f'Total size: {len(prompts)}')
    if prompts:
        print(f'Sample show: {prompts[0]}')
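
    # The process-spawning step is missing from this file; below is a minimal
    # sketch of the standard vLLM data-parallel launch pattern implied by the
    # imports (Process, get_open_port) and by main()'s signature. The exact
    # original launch code is an assumption.
    dp_master_ip = "127.0.0.1"  # assumed single-node setup
    dp_master_port = get_open_port()

    procs = []
    for rank in range(args.DP_size):
        # Default (fork) start method so each child inherits the global `args`
        # that main() reads.
        proc = Process(target=main,
                       args=(args.DP_size, rank, dp_master_ip, dp_master_port,
                             args.GPUs_per_dp_rank, data_inps))
        proc.start()
        procs.append(proc)
    for proc in procs:
        proc.join()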