awacke1 committed
Commit 973de77 • 1 Parent(s): c4cc6f7

Update app.py

Files changed (1): app.py (+21, -21)
app.py CHANGED
@@ -1,55 +1,55 @@
+# grab a dataset, prove we can save it
 from datasets import load_dataset
 raw_datasets = load_dataset("allocine")
-#raw_datasets.save_to_disk("awacke1/my-arrow-datasets")
 raw_datasets.save_to_disk("my-arrow-datasets")
 
-#raw_datasets = load_dataset("awacke1/my-arrow-datasets")
-#raw_datasets = load_dataset("my-arrow-datasets")
-#raw_datasets.cache_files
-
-#from datasets import load_dataset
-#dataset = load_dataset("awacke1/my-arrow-datasets")
-
-
+# load dataset from disk - prove we can reload it
 from datasets import load_from_disk
-#arrow_datasets_reloaded = load_from_disk("awacke1/my-arrow-datasets")
 arrow_datasets_reloaded = load_from_disk("my-arrow-datasets")
 arrow_datasets_reloaded
 
+# prove we can save and load public local dataset on huggingface spaces
+raw_datasets.save_to_disk("awacke1/my-arrow-datasets")
+arrow_datasets_reloaded = load_from_disk("awacke1/my-arrow-datasets")
+awacke1_public_datasets = load_dataset("awacke1/my-arrow-datasets")
+awacke1_public_datasets
+
+#raw_datasets = load_dataset("my-arrow-datasets")
+#raw_datasets.cache_files
 
+#from datasets import load_dataset
+#dataset = load_dataset("awacke1/my-arrow-datasets")
 
+# prove we can save in CSV
 for split, dataset in raw_datasets.items():
     dataset.to_csv(f"my-dataset-{split}.csv", index=None)
-
 data_files = {
     "train": "my-dataset-train.csv",
     "validation": "my-dataset-validation.csv",
     "test": "my-dataset-test.csv",
 }
-
 csv_datasets_reloaded = load_dataset("csv", data_files=data_files)
 csv_datasets_reloaded
 
 
+# prove we can save in JSON
 for split, dataset in raw_datasets.items():
     dataset.to_json(f"my-dataset-{split}.jsonl")
-
-for split, dataset in raw_datasets.items():
-    dataset.to_parquet(f"my-dataset-{split}.parquet")
-
-
 json_data_files = {
     "train": "my-dataset-train.jsonl",
     "validation": "my-dataset-validation.jsonl",
     "test": "my-dataset-test.jsonl",
 }
+json_datasets_reloaded = load_dataset("json", data_files=json_data_files)
+json_datasets_reloaded
+
+# prove we can save in Parquet
+for split, dataset in raw_datasets.items():
+    dataset.to_parquet(f"my-dataset-{split}.parquet")
 parquet_data_files = {
     "train": "my-dataset-train.parquet",
     "validation": "my-dataset-validation.parquet",
     "test": "my-dataset-test.parquet",
 }
-
-json_datasets_reloaded = load_dataset("json", data_files=json_data_files)
 parquet_datasets_reloaded = load_dataset("parquet", data_files=parquet_data_files)
-
-
+parquet_datasets_reloaded
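
A note on the "huggingface spaces" block above: save_to_disk("awacke1/my-arrow-datasets") only writes Arrow files into a local directory literally named awacke1/my-arrow-datasets, while load_dataset("awacke1/my-arrow-datasets") treats that string as a Hub repo id, so the final reload succeeds only if a dataset repo with that id already exists on the Hub. A minimal sketch of the publish-then-reload flow, assuming an authenticated session (huggingface-cli login) and that awacke1/my-arrow-datasets is the intended Hub repo; this snippet is illustrative and not part of the commit:

# hypothetical sketch: publish the splits to the Hub, then reload by repo id
from datasets import load_dataset

raw_datasets = load_dataset("allocine")
raw_datasets.push_to_hub("awacke1/my-arrow-datasets")     # uploads every split to the Hub repo
hub_datasets = load_dataset("awacke1/my-arrow-datasets")  # now resolves against the Hub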
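
The CSV, JSON, and Parquet blocks each rebuild the DatasetDict from the exported split files. A quick consistency check to back the "prove we can save" comments, assuming the exports above ran and allocine's usual train/validation/test splits; this snippet is also illustrative and not part of the commit:

# hypothetical sketch: confirm each reloaded format preserves the per-split row counts
for split in ("train", "validation", "test"):
    for reloaded in (csv_datasets_reloaded, json_datasets_reloaded, parquet_datasets_reloaded):
        assert reloaded[split].num_rows == raw_datasets[split].num_rows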