sps44 committed on
Commit
d294a27
1 Parent(s): 75d4c04

switch to parquet save

Files changed (2)
  1. prepare.py +8 -6
  2. run.py +10 -7
prepare.py CHANGED
@@ -3,20 +3,22 @@ import datasets
 import os
 
 if __name__ == "__main__":
-    cache_file = "dataset_cache.pkl"
+    cache_file = "dataset_cache.parquet"
     if os.path.exists(cache_file):
         # Load dataset from cache
-        with open(cache_file, "rb") as file:
-            dataset = pickle.load(file)
+        df = pd.read_parquet(cache_file)
         print("Dataset loaded from cache.")
     else:
         # Load dataset using datasets.load_dataset()
-        dataset = datasets.load_dataset("renumics/cifar100-enriched", split="train")
+        dataset = datasets.load_dataset("renumics/cifar100-enriched", split="test")
         print("Dataset loaded using datasets.load_dataset().")
+
+        df = dataset.to_pandas()
+
 
         # Save dataset to cache
-        with open(cache_file, "wb") as file:
-            pickle.dump(dataset, file)
+        # save df as parquet
+        df.to_parquet(cache_file)
 
         print("Dataset saved to cache.")
 
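For reference, a minimal self-contained sketch of the caching pattern the new prepare.py follows. The imports shown here (and the pyarrow dependency that pandas needs for parquet I/O) are assumptions; in the actual file they sit above the hunk, outside the diff.

# Sketch of the parquet cache pattern, assuming pandas, pyarrow and datasets are installed.
import os

import datasets
import pandas as pd

CACHE_FILE = "dataset_cache.parquet"

if os.path.exists(CACHE_FILE):
    # Reload the previously flattened DataFrame straight from disk.
    df = pd.read_parquet(CACHE_FILE)
else:
    # Download the split once, flatten it to pandas, and persist it as parquet.
    dataset = datasets.load_dataset("renumics/cifar100-enriched", split="test")
    df = dataset.to_pandas()
    df.to_parquet(CACHE_FILE)

Compared with the old pickle cache, the parquet file stores a plain DataFrame rather than a datasets.Dataset object, which is why both scripts now work with df directly after loading.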
run.py CHANGED
@@ -4,25 +4,28 @@ from renumics import spotlight
 import os
 
 if __name__ == "__main__":
-    cache_file = "dataset_cache.pkl"
+    cache_file = "dataset_cache.parquet"
     if os.path.exists(cache_file):
         # Load dataset from cache
-        with open(cache_file, "rb") as file:
-            dataset = pickle.load(file)
+        df = pd.read_parquet(cache_file)
+
         print("Dataset loaded from cache.")
     else:
         # Load dataset using datasets.load_dataset()
-        dataset = datasets.load_dataset("renumics/cifar100-enriched", split="train")
+        dataset = datasets.load_dataset("renumics/cifar100-enriched", split="test")
         print("Dataset loaded using datasets.load_dataset().")
+
+        df = dataset.to_pandas()
+
 
         # Save dataset to cache
-        with open(cache_file, "wb") as file:
-            pickle.dump(dataset, file)
+        # save df as parquet
+        df.to_parquet(cache_file)
 
         print("Dataset saved to cache.")
 
 
-    df = dataset.to_pandas()
+    #df = dataset.to_pandas()
     df_show = df.drop(columns=['embedding', 'probabilities'])
     while True:
         view = spotlight.show(df_show.sample(5000, random_state=1), port=7860, host="0.0.0.0",
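Downstream, run.py reuses the same cache before handing the data to Spotlight. A rough sketch of that flow, assuming the cache file already exists and using only the spotlight.show arguments visible in the diff (the trailing arguments of the truncated call above are not reproduced):

# Sketch: load the cached parquet and serve a sampled view with Spotlight.
import pandas as pd
from renumics import spotlight

df = pd.read_parquet("dataset_cache.parquet")

# Drop the large per-row vector columns before visualisation.
df_show = df.drop(columns=["embedding", "probabilities"])

# Serve an interactive view of a 5000-row sample on port 7860.
spotlight.show(df_show.sample(5000, random_state=1), port=7860, host="0.0.0.0")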