"""Filter a JSONL dataset, dropping samples whose image file is missing.

Reads the records in ``json_data``, skips every record that names an
``image`` which does not exist under ``image_data_path``, and writes the
remaining records to ``json_data_new`` in JSON Lines format.
"""
import json
import os


def load_jsonl(json_file):
    """Return the records stored in a JSON Lines file as a list.

    Args:
        json_file: Path of the file to read; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(json_file, encoding="utf-8") as f:
        # Stream the file instead of materialising every line first, and
        # skip blank lines (e.g. a trailing empty line) that json.loads
        # would reject.
        return [json.loads(line) for line in f if line.strip()]


def write_jsonl(records, json_file):
    """Write ``records`` to ``json_file`` as JSON Lines (one object per line).

    The output can be read back with ``load_jsonl``.
    """
    with open(json_file, 'w', encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)


def filter_missing_images(samples, image_root):
    """Return the samples whose referenced image exists on disk.

    A sample without an ``'image'`` key is always kept. A sample whose
    ``'image'`` value, joined onto ``image_root``, does not exist is reported
    on stdout and dropped.

    Args:
        samples: Iterable of dict records.
        image_root: Directory that the ``'image'`` values are relative to.

    Returns:
        A new list with the retained samples, in their original order.
    """
    kept = []
    for sample in samples:
        if 'image' in sample and not os.path.exists(
                os.path.join(image_root, sample['image'])):
            print("Missing: ", sample)
            continue
        kept.append(sample)
    return kept


json_data = "/mnt/bn/xiangtai-training-data/project/VLM/data/SOLO_SFT/all_data.jsonl"
json_data_new = "/mnt/bn/xiangtai-training-data/project/VLM/data/all_data_new.jsonl"
image_data_path = "/mnt/bn/xiangtai-training-data/project/VLM/data/SOLO_SFT/images"


def main(src=json_data, dst=json_data_new, image_root=image_data_path):
    """Filter ``src`` against ``image_root`` and write the result to ``dst``.

    Returns:
        The list of samples that were written.
    """
    kept = filter_missing_images(load_jsonl(src), image_root)
    # NOTE(review): the previous version dumped the whole list as a single
    # JSON array into this ".jsonl" file; it is now written line-delimited so
    # that it round-trips through load_jsonl. Confirm downstream readers
    # expect JSON Lines.
    write_jsonl(kept, dst)
    return kept


if __name__ == "__main__":
    main()