Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,034 Bytes
59b2a81 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
'''
This file prepares the dataset as a CSV file following the format required by Open-Sora
'''
import os, sys, shutil
import json
import csv
# Import files from the local folder
root_path = os.path.abspath('.')
sys.path.append(root_path)
# from curation_pipeline.prepare_bridge_v1 import read_bridge_v1
# from curation_pipeline.prepare_bridge_v2 import read_bridge_v2
def iter_dataset(dataset_path):
    '''
    Collect one metadata row per trajectory sub-folder of *dataset_path*.

    Each sub-folder is expected to contain sequentially numbered frames
    (im_0.jpg, im_1.jpg, ...) plus a ``lang.txt`` file whose first line
    is the language prompt for that trajectory.

    Args:
        dataset_path (str): Root folder with one sub-folder per trajectory.

    Returns:
        list[list]: Rows of [sub_folder_path, prompt, num_frames, height, width].
                    Height/width are fixed at 480x640 — assumes the dataset
                    convention; TODO confirm against the raw frames.
    '''
    lists = []
    for sub_folder_name in os.listdir(dataset_path):
        sub_folder_path = os.path.join(dataset_path, sub_folder_name)

        # Count frames by probing im_0.jpg, im_1.jpg, ... until the first gap.
        # (The previous range(len(listdir)) loop undercounted by one when every
        # directory entry was a frame, and raised NameError on an empty folder.)
        num_frames = 0
        while os.path.exists(os.path.join(sub_folder_path, 'im_' + str(num_frames) + '.jpg')):
            num_frames += 1

        # Read the language prompt; 'with' guarantees the handle is closed.
        txt_path = os.path.join(sub_folder_path, "lang.txt")
        with open(txt_path, "r") as f:
            # Drop the trailing newline so it doesn't end up inside the CSV cell.
            lang_prompt = f.readline().rstrip('\n')

        lists.append([sub_folder_path, lang_prompt, num_frames, 480, 640])
    return lists
if __name__ == "__main__":
    # Roots of the raw Bridge V1/V2 datasets (one sub-folder per trajectory).
    v1_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v1_raw"
    v2_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v2_raw"
    store_name = "Bridge_raw.csv"

    # Start fresh: remove any stale CSV left over from a previous run.
    if os.path.exists(store_name):
        os.remove(store_name)

    # Header row first, then one row per trajectory from each dataset version.
    full_lists = [["path", "text", "num_frames", "height", "width"]]
    full_lists.extend(iter_dataset(v1_dataset_path))
    full_lists.extend(iter_dataset(v2_dataset_path))
    print("Full length is ", len(full_lists))

    # Store as a CSV file. newline='' is required by the csv module so that
    # rows are not written with extra blank lines on Windows.
    with open(store_name, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(full_lists)