sps44 committed
Commit eebf151
1 Parent(s): 6f6c475

build dataset on machine

Files changed (4):
  1. Dockerfile +3 -2
  2. cifar100-enriched-cv.parquet +0 -3
  3. prepare.py +44 -0
  4. run.py +4 -7
Dockerfile CHANGED
@@ -8,11 +8,12 @@ ENV HOME=/code
 RUN apt install curl
 RUN pip install pip -U
 
-RUN pip install renumics-spotlight pyarrow
+RUN pip install renumics-spotlight==1.3.0rc8 pyarrow
 
-# RUN pip install datasets
+RUN pip install datasets
 
 COPY . .
 RUN mkdir -p /code/.cache
 RUN chmod -R 777 /code
+RUN python prepare.py
 CMD ["python", "run.py"]
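
The new RUN python prepare.py step executes while the image is built, so the CleanVision pass runs once at build time and the resulting parquet cache ships inside the image instead of being tracked via Git LFS. A hypothetical in-container check (the cache path is an assumption; the excerpt shows ENV HOME=/code but no WORKDIR):

import os

# look for the cache written by `RUN python prepare.py`; both candidate
# locations are assumptions, since the Dockerfile excerpt sets no WORKDIR
for candidate in ("/code/dataset_cache.parquet", "dataset_cache.parquet"):
    print(candidate, "exists:", os.path.exists(candidate))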
cifar100-enriched-cv.parquet DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a00c5f027c054d5a697540d1396d07f70cd651ff59b6b862f81498e164279351
-size 37007568
prepare.py ADDED
@@ -0,0 +1,44 @@
+import os
+
+import datasets
+import pandas as pd
+from cleanvision.imagelab import Imagelab
+
+
+def cv_issues_cleanvision(df, image_name="image"):
+    # run CleanVision's image checks (dark, blurry, duplicate, ...) on the files
+    image_paths = df[image_name].to_list()
+    imagelab = Imagelab(filepaths=image_paths)
+    imagelab.find_issues()
+
+    return imagelab.issues.reset_index()
+
+
+if __name__ == "__main__":
+    cache_file = "dataset_cache.parquet"
+    if os.path.exists(cache_file):
+        # load the enriched dataframe from the local parquet cache
+        df = pd.read_parquet(cache_file)
+        print("Dataset loaded from cache.")
+    else:
+        # download the enriched CIFAR-100 test split from the Hugging Face Hub
+        dataset = datasets.load_dataset("renumics/cifar100-enriched", split="test")
+        print("Dataset loaded using datasets.load_dataset().")
+
+        df = dataset.to_pandas().reset_index(drop=True)
+
+        # append the CleanVision issue columns and cache the result as parquet
+        df_cv = cv_issues_cleanvision(df)
+        df = pd.concat([df, df_cv], axis=1)
+
+        df.to_parquet(cache_file)
+        print("Dataset saved to cache.")
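
Once the image (or a local checkout) has run prepare.py, the parquet cache can be inspected offline. A minimal sketch, assuming CleanVision's usual column naming (boolean is_<issue>_issue flags plus <issue>_score columns; the exact names are not pinned down by this commit):

import pandas as pd

# load the cache written by prepare.py and summarize the CleanVision flags
df = pd.read_parquet("dataset_cache.parquet")
issue_cols = [c for c in df.columns if c.endswith("_issue")]
print(f"{len(df)} rows, {len(issue_cols)} issue flag columns")
print(df[issue_cols].sum().sort_values(ascending=False))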
run.py CHANGED
@@ -4,22 +4,19 @@ import pickle
 from renumics import spotlight
 
 if __name__ == "__main__":
-    cache_file = "dataset_cache.pkl"
+    cache_file = "dataset_cache.parquet"
     issue_cache_file = "sliceline.pkl"
 
     import pandas as pd
 
     # load dataframe from parquet
-    df = pd.read_parquet('cifar100-enriched-cv.parquet')
+    df = pd.read_parquet(cache_file)
 
-    #with open(issue_cache_file, "rb") as issue_file:
-    #    issues = pickle.load(issue_file)
+    # load the precomputed sliceline issues; without this, the `issues=issues`
+    # argument below would raise a NameError
+    with open(issue_cache_file, "rb") as issue_file:
+        issues = pickle.load(issue_file)
 
     df_show = df.drop(columns=['embedding', 'probabilities'])
 
     while True:
-        #view = spotlight.show(df_show, port=7860, host="0.0.0.0", issues=issues, layout="sliceline-layout.json",
-        #    dtype={"image": spotlight.Image, "embedding_reduced": spotlight.Embedding}, allow_filebrowsing=False)
-        view = spotlight.show(df_show, port=7860, host="0.0.0.0",
+        view = spotlight.show(df_show, port=7860, host="0.0.0.0", issues=issues, layout="sliceline-layout.json",
                               dtype={"image": spotlight.Image, "embedding_reduced": spotlight.Embedding}, allow_filebrowsing=False)
         view.close()
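
For local debugging outside the Space, the viewer call reduces to a short sketch (assumes dataset_cache.parquet exists, e.g. after running prepare.py; the host/port binding and the keep-alive while-loop from run.py are dropped):

import pandas as pd
from renumics import spotlight

df = pd.read_parquet("dataset_cache.parquet")

# open Spotlight on the enriched dataframe; heavy columns are dropped as in run.py
spotlight.show(
    df.drop(columns=["embedding", "probabilities"]),
    dtype={"image": spotlight.Image, "embedding_reduced": spotlight.Embedding},
)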