MarkusStoll committed on
Commit
d50a017
1 Parent(s): 334322c
Files changed (3) hide show
  1. Dockerfile +2 -3
  2. prepare.py +22 -2
  3. run.py +23 -16
Dockerfile CHANGED
@@ -2,8 +2,6 @@ FROM python:3.9
2
 
3
  WORKDIR /code
4
  ENV HOME=/code
5
- # COPY ./requirements.txt /code/requirements.txt
6
- # RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
7
 
8
  RUN apt install curl
9
  RUN pip install pip -U
@@ -11,9 +9,10 @@ RUN pip install pip -U
11
  RUN pip install renumics-spotlight==1.2.0rc2
12
 
13
  RUN pip install datasets
 
 
14
 
15
  COPY . .
16
  RUN mkdir -p /code/.cache
17
  RUN chmod -R 777 /code
18
- RUN python prepare.py
19
  CMD ["python", "run.py"]
 
2
 
3
  WORKDIR /code
4
  ENV HOME=/code
 
 
5
 
6
  RUN apt install curl
7
  RUN pip install pip -U
 
9
  RUN pip install renumics-spotlight==1.2.0rc2
10
 
11
  RUN pip install datasets
12
+ COPY prepare.py .
13
+ RUN python prepare.py
14
 
15
  COPY . .
16
  RUN mkdir -p /code/.cache
17
  RUN chmod -R 777 /code
 
18
  CMD ["python", "run.py"]
prepare.py CHANGED
@@ -1,6 +1,8 @@
1
  import pickle
2
  import datasets
3
  import os
 
 
4
 
5
  if __name__ == "__main__":
6
  cache_file = "dataset_cache.pkl"
@@ -11,12 +13,30 @@ if __name__ == "__main__":
11
  print("Dataset loaded from cache.")
12
  else:
13
  # Load dataset using datasets.load_dataset()
14
- dataset = datasets.load_dataset("renumics/cifar100-enriched", split="train")
15
  print("Dataset loaded using datasets.load_dataset().")
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # Save dataset to cache
18
  with open(cache_file, "wb") as file:
19
- pickle.dump(dataset, file)
20
 
21
  print("Dataset saved to cache.")
22
 
 
1
  import pickle
2
  import datasets
3
  import os
4
+ import umap
5
+
6
 
7
  if __name__ == "__main__":
8
  cache_file = "dataset_cache.pkl"
 
13
  print("Dataset loaded from cache.")
14
  else:
15
  # Load dataset using datasets.load_dataset()
16
+ ds = datasets.load_dataset("renumics/mnist-outlier", split="train")
17
  print("Dataset loaded using datasets.load_dataset().")
18
 
19
+ df = ds.rename_columns({"label":"labels"}).to_pandas()
20
+ df["label_str"] = df["labels"].apply(lambda x: ds.features["label"].int2str(x))
21
+
22
+ # df = df[:1000]
23
+
24
+ # precalculate umap embeddings
25
+ df["embedding_ft_precalc"] = umap.UMAP(
26
+ n_neighbors=70, min_dist=0.5, random_state=42
27
+ ).fit_transform(df["embedding_ft"].tolist()).tolist()
28
+ print("Umap for ft done")
29
+
30
+
31
+ df["embedding_foundation_precalc"] = umap.UMAP(
32
+ n_neighbors=70, min_dist=0.5, random_state=42
33
+ ).fit_transform(df["embedding_foundation"].tolist()).tolist()
34
+
35
+ print("Umap for base done")
36
+
37
  # Save dataset to cache
38
  with open(cache_file, "wb") as file:
39
+ pickle.dump(df, file)
40
 
41
  print("Dataset saved to cache.")
42
 
run.py CHANGED
@@ -1,5 +1,4 @@
1
  import pickle
2
- import datasets
3
  from renumics import spotlight
4
  import os
5
 
@@ -8,23 +7,31 @@ if __name__ == "__main__":
8
  if os.path.exists(cache_file):
9
  # Load dataset from cache
10
  with open(cache_file, "rb") as file:
11
- dataset = pickle.load(file)
12
  print("Dataset loaded from cache.")
13
- else:
14
- # Load dataset using datasets.load_dataset()
15
- dataset = datasets.load_dataset("renumics/cifar100-enriched", split="train")
16
- print("Dataset loaded using datasets.load_dataset().")
17
 
18
- # Save dataset to cache
19
- with open(cache_file, "wb") as file:
20
- pickle.dump(dataset, file)
21
 
22
- print("Dataset saved to cache.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
- df = dataset.to_pandas()
26
- df_show = df.drop(columns=['embedding', 'probabilities'])
27
- while True:
28
- view = spotlight.show(df_show.sample(5000, random_state=1), port=7860, host="0.0.0.0",
29
- dtype={"image": spotlight.Image, "embedding_reduced": spotlight.Embedding}, allow_filebrowsing=False)
30
- view.close()
 
1
  import pickle
 
2
  from renumics import spotlight
3
  import os
4
 
 
7
  if os.path.exists(cache_file):
8
  # Load dataset from cache
9
  with open(cache_file, "rb") as file:
10
+ df = pickle.load(file)
11
  print("Dataset loaded from cache.")
 
 
 
 
12
 
 
 
 
13
 
14
+ while True:
15
+ dtypes = {
16
+ "nn_image": spotlight.Image,
17
+ "image": spotlight.Image,
18
+ "embedding_ft": spotlight.Embedding,
19
+ "embedding_foundation": spotlight.Embedding,
20
+ "embedding_ft_precalc": spotlight.Embedding,
21
+ "embedding_foundation_precalc": spotlight.Embedding,
22
+ }
23
+ view = spotlight.show(
24
+ df,
25
+ dtype=dtypes,
26
+ layout="/home/markus/Downloads/layout_ft_hf.json",
27
+ port=7860,
28
+ host="0.0.0.0",
29
+ allow_filebrowsing=False
30
+ )
31
+
32
+ view.close()
33
+
34
+ else:
35
+ print(f"Dataset {cache_file} not found. Please run prepare.py first.")
36
 
37