Steffen Slavetinsky committed on
Commit
6f196c6
1 Parent(s): 3e94b6c

add dataset and install from latest commit

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. dataset.csv +0 -0
  3. run.py +15 -66
Dockerfile CHANGED
@@ -6,7 +6,7 @@ ENV HOME=/code
6
  RUN apt install curl
7
  RUN pip install pip -U
8
 
9
- RUN pip install renumics-spotlight==1.3.0
10
 
11
  RUN pip install datasets
12
  COPY prepare.py .
 
6
  RUN apt install curl
7
  RUN pip install pip -U
8
 
9
+ RUN pip install git+https://github.com/Renumics/spotlight.git@ac8ed08d9a179c15fccfa5f0fa4a0a71fbc3dbe7
10
 
11
  RUN pip install datasets
12
  COPY prepare.py .
dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
run.py CHANGED
@@ -1,72 +1,21 @@
1
- import pickle
2
- from renumics import spotlight
3
- import os
4
- import requests
5
  import pandas as pd
6
  from renumics import spotlight
7
- from renumics.spotlight.analysis import DataIssue
8
-
9
-
10
 
11
  if __name__ == "__main__":
12
- cache_file = "dataset_cache.pkl"
13
-
14
- if os.path.exists(cache_file):
15
- # Load dataset from cache
16
- with open(cache_file, "rb") as file:
17
- df = pickle.load(file)
18
- print("Dataset loaded from cache.")
19
-
20
-
21
- label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
22
- label_issue = DataIssue(
23
- severity="medium",
24
- title="label-issue",
25
- rows=label_issue_rows,
26
- description="Label issue found by cleanlab - Review and correct if necessary",
27
- )
28
-
29
- outlier_issue_row = (
30
- df[df["outlier_score"] < 0.6].sort_values("outlier_score").index.tolist()
31
- )
32
- outlier_issue = DataIssue(
33
- severity="medium",
34
- title="outlier-issue",
35
- rows=outlier_issue_row,
36
- description="Outlier score < 0.6 - Review and remove or collect more data",
37
  )
38
 
39
- near_duplicate_issue_row = (
40
- df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
41
- )
42
- near_duplicate_issue = DataIssue(
43
- severity="medium",
44
- title="near-duplicate-issue",
45
- rows=near_duplicate_issue_row,
46
- description="Near duplicate issue found by cleanlab - Review and remove if necessary",
47
- )
48
-
49
-
50
- while True:
51
- dtypes = {
52
- "image": spotlight.Image,
53
- "image_full": spotlight.Image,
54
- "embedding": spotlight.Embedding,
55
- "embedding_reduced": spotlight.Embedding,
56
- "probabilities": spotlight.Embedding,
57
- }
58
-
59
- view = spotlight.show(
60
- df.rename(columns={"fine_label_str": "label", "fine_label_prediction_str":"pred"}),
61
- dtype=dtypes,
62
- issues=[label_issue,outlier_issue,near_duplicate_issue],
63
- layout="layout.json",
64
- port=7860,
65
- host="0.0.0.0",
66
- allow_filebrowsing=False,
67
- )
68
-
69
- view.close()
70
-
71
- else:
72
- print(f"Dataset {cache_file} not found. Please run prepare.py first.")
 
 
 
 
 
1
  import pandas as pd
2
  from renumics import spotlight
 
 
 
3
 
4
  if __name__ == "__main__":
5
+ df = pd.read_csv("dataset.csv")
6
+ while True:
7
+ dtypes = {
8
+ "text_len": float,
9
+ "text_embedding": spotlight.Embedding,
10
+ }
11
+
12
+ view = spotlight.show(
13
+ df,
14
+ dtype=dtypes,
15
+ layout="layout.json",
16
+ port=7860,
17
+ host="0.0.0.0",
18
+ allow_filebrowsing=False,
 
 
 
 
 
 
 
 
 
 
 
19
  )
20
 
21
+ view.close()