# grab a dataset, prove we can save it
from datasets import load_dataset

raw_datasets = load_dataset("allocine")
raw_datasets.save_to_disk("my-arrow-datasets")

# load the dataset from disk - prove we can reload it
from datasets import load_from_disk

arrow_datasets_reloaded = load_from_disk("my-arrow-datasets")
arrow_datasets_reloaded

# dataset_dict.save_to_disk("../data/wikipedia_rank_nocache")
raw_datasets.save_to_disk("../data/awacke1=allocine")

# prove the reloaded dataset points at its cached Arrow files
arrow_datasets_reloaded.cache_files

# prove we can save in CSV
for split, dataset in raw_datasets.items():
    dataset.to_csv(f"my-dataset-{split}.csv", index=None)

data_files = {
    "train": "my-dataset-train.csv",
    "validation": "my-dataset-validation.csv",
    "test": "my-dataset-test.csv",
}
csv_datasets_reloaded = load_dataset("csv", data_files=data_files)
csv_datasets_reloaded

# prove we can save in JSON Lines
for split, dataset in raw_datasets.items():
    dataset.to_json(f"my-dataset-{split}.jsonl")

json_data_files = {
    "train": "my-dataset-train.jsonl",
    "validation": "my-dataset-validation.jsonl",
    "test": "my-dataset-test.jsonl",
}
json_datasets_reloaded = load_dataset("json", data_files=json_data_files)
json_datasets_reloaded

# prove we can save in Parquet
for split, dataset in raw_datasets.items():
    dataset.to_parquet(f"my-dataset-{split}.parquet")

parquet_data_files = {
    "train": "my-dataset-train.parquet",
    "validation": "my-dataset-validation.parquet",
    "test": "my-dataset-test.parquet",
}
parquet_datasets_reloaded = load_dataset("parquet", data_files=parquet_data_files)
parquet_datasets_reloaded

# prove we can save and reload under a namespaced local path
# (e.g. inside a public Hugging Face Space's filesystem)
raw_datasets.save_to_disk("awacke1/my-arrow-datasets")
arrow_datasets_reloaded = load_from_disk("awacke1/my-arrow-datasets")
thisworked = "Yes really worked"
arrow_datasets_reloaded
thisworked

# awacke1_public_datasets = load_dataset("awacke1/my-arrow-datasets")
# awacke1_public_datasets
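
# Note: load_from_disk above reads a *local* Arrow folder; the commented-out
# load_dataset("awacke1/my-arrow-datasets") call would only succeed once the
# dataset actually lives on the Hugging Face Hub. A minimal sketch of that
# round trip, assuming you are authenticated (huggingface-cli login) and that
# the repo id "awacke1/my-arrow-datasets" is available to you:

# push every split of the DatasetDict to a Hub dataset repo in one call
raw_datasets.push_to_hub("awacke1/my-arrow-datasets")

# anyone can then reload it by repo id, no local files required
hub_datasets_reloaded = load_dataset("awacke1/my-arrow-datasets")
hub_datasets_reloaded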