# Source: llava-uhd-new/scripts/summarize_data.py
# Last commit c728d79 (verified) by ZzzHelloWorld: "Add files using upload-large-folder tool"
import json
import os
from tqdm import tqdm
# Input/output locations on the original author's cluster.
DATA_DIR = "/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data"
MIX665K_PATH = f"{DATA_DIR}/llava_v1_5_mix665k.json"
INSTRUCT_150K_PATH = f"{DATA_DIR}/llava_instruct_150k.json"
# NOTE(review): the output name advertises "plus_mixtral_instruct_135K_of_158K_V1.5",
# but the step that appended records from mixtral_instruct_135K_of_158K_V1.5.json
# (prefixing image paths lacking "coco" with "coco/train2017/") was commented out
# in the original script, so no Mixtral data is written. The previously unused
# load of that file has been dropped. Confirm whether the merge should be
# restored or the output renamed.
OUTPUT_PATH = (
    f"{DATA_DIR}/llava_v1_5_mix665k_minus_llava_instruct_150k_minus_sharegpt"
    "_plus_mixtral_instruct_135K_of_158K_V1.5.json"
)


def load_json(path: str):
    """Return the parsed contents of the JSON file at *path*."""
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def conversation_key(item: dict) -> str:
    """Return the de-duplication key for one dataset record.

    The key is the record's ``"id"`` followed by ``"_"`` and the concatenated
    ``"value"`` of every turn in ``"conversations"``.
    """
    turns = "".join(turn["value"] for turn in item["conversations"])
    return f'{item["id"]}_{turns}'


def select_new_items(mix_items, instruct_items=()) -> list:
    """Return the records of *mix_items* to keep, in their original order.

    A record is dropped when its key (see ``conversation_key``) matches

    * any record of *instruct_items*, or
    * any record of *mix_items* that has no ``"image"`` field (named
      ``share_gpt`` in the original script).

    Passing an empty *instruct_items* removes only the text-only records,
    which is the alternative selection the original script kept commented out.
    """
    # Compute each key once; it is needed for both exclusion and filtering.
    mix_keys = [conversation_key(item) for item in mix_items]

    excluded = {conversation_key(item) for item in instruct_items}
    excluded.update(
        key for key, item in zip(mix_keys, mix_items) if "image" not in item
    )
    return [
        item for key, item in zip(mix_keys, mix_items) if key not in excluded
    ]


def main() -> None:
    """Filter the mix665k dataset and write the remaining records as JSON."""
    # Record counts in the trailing comments are the original author's notes.
    llava_v1_5_mix665k = load_json(MIX665K_PATH)  # 665298
    llava_instruct_150k = load_json(INSTRUCT_150K_PATH)  # 157712

    # 466898 kept; 40688 text-only keys; 624610 if only text-only is removed.
    new_data = select_new_items(llava_v1_5_mix665k, llava_instruct_150k)
    print(len(new_data))

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(new_data, f)


if __name__ == "__main__":
    main()