File size: 489 Bytes
f8a2041
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import datasets
import logging
import os
from tqdm import tqdm
# Local directory whose entries are used as subset names. Each name is
# substituted into the remote data_files pattern "data/<name>/data.json".
PATH = "/Users/reshinthadithyan/master/research/code-research/carperai/pile-v2-small-filtered/data"
dataset_subs = os.listdir(PATH)

print(dataset_subs)


# For every subset name, load its "train" split from
# "CarperAI/pile-v2-small-filtered" and persist it under cache_ds/<name>.
# The loop is best-effort: a failing subset is reported and skipped so the
# remaining subsets are still processed.
for ds in tqdm(dataset_subs):
    try:
        print(ds)
        dataset = datasets.load_dataset(
            "CarperAI/pile-v2-small-filtered",
            data_files=f"data/{ds}/data.json",
            split="train",
        )
        dataset.save_to_disk(f"cache_ds/{ds}")
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the run, and record the
        # traceback so the cause of the failure is not lost.
        print(f"Error at {ds}")
        logging.exception("Failed to load or save subset %s", ds)