|
import json |
|
import os |
|
from tqdm import tqdm |
|
|
|
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_v1_5_mix665k.json") as f: |
|
llava_v1_5_mix665k = json.load(f) |
|
|
|
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_instruct_150k.json") as f: |
|
llava_instruct_150k = json.load(f) |
|
|
|
|
|
# Build a dedup key for every mix665k sample: "<id>_<concatenated conversation text>".
# Presumably the conversation text is appended because ids alone may collide
# across subsets — TODO confirm against the dataset.
mix665k_ids = set()
for item in llava_v1_5_mix665k:
    # str.join runs in O(total length); the original `+=` loop was quadratic.
    all_conv = "".join(turn["value"] for turn in item["conversations"])
    mix665k_ids.add(f'{item["id"]}_{all_conv}')
|
|
|
# Same key scheme for the llava_instruct_150k samples so the two sets are
# directly comparable for set subtraction below.
instruct_150k_ids = set()
for item in llava_instruct_150k:
    # str.join runs in O(total length); the original `+=` loop was quadratic.
    all_conv = "".join(turn["value"] for turn in item["conversations"])
    instruct_150k_ids.add(f'{item["id"]}_{all_conv}')
|
|
|
# Keys for the text-only (ShareGPT-style) samples inside mix665k — identified
# by the absence of an "image" field.
share_gpt_ids = set()
for item in llava_v1_5_mix665k:
    if "image" not in item:
        # str.join runs in O(total length); the original `+=` loop was quadratic.
        all_conv = "".join(turn["value"] for turn in item["conversations"])
        share_gpt_ids.add(f'{item["id"]}_{all_conv}')
|
|
|
|
|
# Keep only mix665k samples that are in neither instruct_150k nor ShareGPT.
new_ids = mix665k_ids.difference(instruct_150k_ids, share_gpt_ids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Materialize the surviving samples, preserving the original mix665k order.
new_data = []
for item in llava_v1_5_mix665k:
    # str.join runs in O(total length); the original `+=` loop was quadratic.
    all_conv = "".join(turn["value"] for turn in item["conversations"])
    if f'{item["id"]}_{all_conv}' in new_ids:
        new_data.append(item)
|
|
|
# Load the Mixtral-regenerated instruct data (135K of the original 158K
# conversations) that replaces the removed llava_instruct_150k portion.
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/mixtral_instruct_135K_of_158K_V1.5.json") as f:
    new_mixtral_instruct = json.load(f)

# BUG FIX: this data was loaded but never merged, even though the output
# filename advertises "...plus_mixtral_instruct_135K_of_158K...". Merge it so
# the written file matches its name. (A leftover `pdb.set_trace()` debugging
# breakpoint that halted the script was also removed.)
new_data.extend(new_mixtral_instruct)

# Final sample count after subtraction and the Mixtral merge.
print(len(new_data))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_v1_5_mix665k_minus_llava_instruct_150k_minus_sharegpt_plus_mixtral_instruct_135K_of_158K_V1.5.json", "w") as f: |
|
json.dump(new_data, f) |
|
|