zhouyik's picture
Upload folder using huggingface_hub
032e687 verified
import json
import os
def load_jsonl(json_file):
with open(json_file) as f:
lines = f.readlines()
data = []
for line in lines:
data.append(json.loads(line))
return data
json_data = "/mnt/bn/xiangtai-training-data/project/VLM/data/SOLO_SFT/all_data.jsonl"
json_data_new = "/mnt/bn/xiangtai-training-data/project/VLM/data/all_data_new.jsonl"
image_data_path = "/mnt/bn/xiangtai-training-data/project/VLM/data/SOLO_SFT/images"
new_json_data = []
a = load_jsonl(json_data)
for index, i in enumerate(a):
conversations = i['conversations']
if 'image' in i.keys() and not os.path.exists(os.path.join(image_data_path, i['image'])):
# print("find", i)
# exit()
print("Missing: ",i)
continue
new_json_data.append(i)
with open(json_data_new, 'w') as f:
json.dump(new_json_data, f)
# print(os.path.join(image_data_path, i['image']))
# if not os.path.exists(os.path.join(image_data_path, i['image'])):
# print(i['images'])
# for msg in conversations:
# if "role" in msg.keys():
# print(i)
# print(index)
# exit()
# elif 'from' in msg.keys():
# continue
# elif 'value' in msg.keys():
# continue
# else:
# print(msg.keys)
# if msg['from'] == 'human' or msg['from'] == 'user' or msg['role'] == 'user':
# continue
# elif msg['from'] == 'gpt' or msg['from'] == 'model' or msg['role'] == 'assistant':
# continue
# for item in conversations:
# if type(item) is str:
# print(conversations)