sps44 committed on
Commit
6398c9c
1 Parent(s): 2a6b5a3

parquet save

Browse files
Files changed (2) hide show
  1. prepare.py +4 -5
  2. run.py +5 -6
prepare.py CHANGED
@@ -1,13 +1,13 @@
1
  import pickle
2
  import datasets
3
  import os
 
4
 
5
  if __name__ == "__main__":
6
- cache_file = "dataset_cache.pkl"
7
  if os.path.exists(cache_file):
8
  # Load dataset from cache
9
- with open(cache_file, "rb") as file:
10
- dataset = pickle.load(file)
11
  print("Dataset loaded from cache.")
12
  else:
13
  # Load dataset using datasets.load_dataset()
@@ -15,8 +15,7 @@ if __name__ == "__main__":
15
  print("Dataset loaded using datasets.load_dataset().")
16
 
17
  # Save dataset to cache
18
- with open(cache_file, "wb") as file:
19
- pickle.dump(dataset, file)
20
 
21
  print("Dataset saved to cache.")
22
 
 
1
  import pickle
2
  import datasets
3
  import os
4
+ import pandas as pd
5
 
6
  if __name__ == "__main__":
7
+ cache_file = "dataset_cache.parquet"
8
  if os.path.exists(cache_file):
9
  # Load dataset from cache
10
+ df = pd.read_parquet(cache_file)
 
11
  print("Dataset loaded from cache.")
12
  else:
13
  # Load dataset using datasets.load_dataset()
 
15
  print("Dataset loaded using datasets.load_dataset().")
16
 
17
  # Save dataset to cache
18
+ df.to_parquet(cache_file)
 
19
 
20
  print("Dataset saved to cache.")
21
 
run.py CHANGED
@@ -2,13 +2,13 @@ import pickle
2
  import datasets
3
  from renumics import spotlight
4
  import os
 
5
 
6
  if __name__ == "__main__":
7
- cache_file = "dataset_cache.pkl"
8
  if os.path.exists(cache_file):
9
  # Load dataset from cache
10
- with open(cache_file, "rb") as file:
11
- dataset = pickle.load(file)
12
  print("Dataset loaded from cache.")
13
  else:
14
  # Load dataset using datasets.load_dataset()
@@ -16,13 +16,12 @@ if __name__ == "__main__":
16
  print("Dataset loaded using datasets.load_dataset().")
17
 
18
  # Save dataset to cache
19
- with open(cache_file, "wb") as file:
20
- pickle.dump(dataset, file)
21
 
22
  print("Dataset saved to cache.")
23
 
24
 
25
- df = dataset.to_pandas()
26
  df_show = df.drop(columns=['embedding', 'probabilities'])
27
  while True:
28
  view = spotlight.show(df_show.sample(5000, random_state=1), port=7860, host="0.0.0.0",
 
2
  import datasets
3
  from renumics import spotlight
4
  import os
5
+ import pandas as pd
6
 
7
  if __name__ == "__main__":
8
+ cache_file = "dataset_cache.parquet"
9
  if os.path.exists(cache_file):
10
  # Load dataset from cache
11
+ df = pd.read_parquet(cache_file)
 
12
  print("Dataset loaded from cache.")
13
  else:
14
  # Load dataset using datasets.load_dataset()
 
16
  print("Dataset loaded using datasets.load_dataset().")
17
 
18
  # Save dataset to cache
19
+ df.to_parquet(cache_file)
 
20
 
21
  print("Dataset saved to cache.")
22
 
23
 
24
+ #df = dataset.to_pandas()
25
  df_show = df.drop(columns=['embedding', 'probabilities'])
26
  while True:
27
  view = spotlight.show(df_show.sample(5000, random_state=1), port=7860, host="0.0.0.0",