# pile-v2-eda / load_dataset.py
# Reshinth Adithyan
# Update dataset splits
# f8a2041
import datasets
import logging
import os
from tqdm import tqdm
# Local directory whose entry names are used as the dataset subset names.
# NOTE(review): this is a machine-specific absolute path; presumably a local
# checkout of the dataset repository -- confirm before running elsewhere.
PATH = "/Users/reshinthadithyan/master/research/code-research/carperai/pile-v2-small-filtered/data"
dataset_subs = os.listdir(PATH)
print(dataset_subs)

# For every subset name, load the "train" split of data/<subset>/data.json
# from the CarperAI/pile-v2-small-filtered dataset and save it to
# cache_ds/<subset>.
for ds in tqdm(dataset_subs):
    try:
        print(ds)
        dataset = datasets.load_dataset(
            "CarperAI/pile-v2-small-filtered",
            data_files=f"data/{ds}/data.json",
            split="train",
        )
        dataset.save_to_disk(f"cache_ds/{ds}")
    except Exception:
        # Best-effort: a failing subset must not abort the remaining ones.
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # stop the run, and log the traceback so the failure cause is kept.
        logging.exception("Error at %s", ds)