MarkusStoll committed on
Commit
5ee8932
1 Parent(s): ebe1006

navigate ready

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -1
  2. README.md +1 -2
  3. layout.json +39 -41
  4. run.py +28 -7
Dockerfile CHANGED
@@ -6,7 +6,7 @@ ENV HOME=/code
6
  RUN apt install curl
7
  RUN pip install pip -U
8
 
9
- RUN pip install renumics-spotlight==1.3.0rc6
10
 
11
  RUN pip install datasets
12
  COPY prepare.py .
 
6
  RUN apt install curl
7
  RUN pip install pip -U
8
 
9
+ RUN pip install renumics-spotlight==1.3.0rc7 httpx
10
 
11
  RUN pip install datasets
12
  COPY prepare.py .
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Cleanlab CIFAR-100 with Spotlight
3
  emoji: 🧐
4
  colorFrom: gray
5
  colorTo: blue
@@ -14,7 +14,6 @@ tags:
14
  - renumics
15
  - spotlight
16
  - EDA
17
- duplicated_from: renumics/cifar10-cleanlab
18
  ---
19
 
20
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Navigate Cleanlab Data Issues in CIFAR-100 with Spotlight
3
  emoji: 🧐
4
  colorFrom: gray
5
  colorTo: blue
 
14
  - renumics
15
  - spotlight
16
  - EDA
 
17
  ---
18
 
19
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
layout.json CHANGED
@@ -3,12 +3,12 @@
3
  "children": [
4
  {
5
  "kind": "split",
6
- "weight": 24.37657642133775,
7
  "orientation": "vertical",
8
  "children": [
9
  {
10
  "kind": "tab",
11
- "weight": 23.652554002465973,
12
  "children": [
13
  {
14
  "kind": "widget",
@@ -17,10 +17,10 @@
17
  "config": {
18
  "tableView": "full",
19
  "visibleColumns": [
20
- "fine_label_prediction_str",
21
- "fine_label_str",
22
  "label_score",
23
  "near_duplicate_score",
 
24
  "outlier_score"
25
  ],
26
  "sorting": null,
@@ -31,7 +31,7 @@
31
  },
32
  {
33
  "kind": "tab",
34
- "weight": 11.033364782611708,
35
  "children": [
36
  {
37
  "kind": "widget",
@@ -43,7 +43,7 @@
43
  },
44
  {
45
  "kind": "tab",
46
- "weight": 38.67424218071708,
47
  "children": [
48
  {
49
  "kind": "widget",
@@ -54,50 +54,26 @@
54
  {
55
  "view": "ImageView",
56
  "columns": [
57
- "image"
58
  ],
59
- "name": "image",
60
- "key": "iW3ihwygEHg4QZv5YzJ8ww"
61
  },
62
  {
63
  "view": "TextLens",
64
  "columns": [
65
- "fine_label_str"
66
  ],
67
  "name": "view",
68
- "key": "346d7554-5395-44d6-b358-a351901cb02e"
69
  },
70
  {
71
  "view": "TextLens",
72
  "columns": [
73
- "fine_label_prediction_str"
74
  ],
75
  "name": "view",
76
- "key": "99761cf8-350a-469d-8dbc-0df7b0db4d48"
77
- },
78
- {
79
- "view": "ScalarView",
80
- "columns": [
81
- "label_score"
82
- ],
83
- "name": "view",
84
- "key": "4ae33ae9-919a-4b10-9216-cd7c9448f9ac"
85
- },
86
- {
87
- "view": "ScalarView",
88
- "columns": [
89
- "outlier_score"
90
- ],
91
- "name": "view",
92
- "key": "13fb6430-3ffc-422c-92be-243b174b9a15"
93
- },
94
- {
95
- "view": "ScalarView",
96
- "columns": [
97
- "near_duplicate_score"
98
- ],
99
- "name": "view",
100
- "key": "daf7c0b7-2185-4e50-9eb0-ffab8d1ff906"
101
  }
102
  ],
103
  "visibleColumns": 8
@@ -109,7 +85,7 @@
109
  },
110
  {
111
  "kind": "tab",
112
- "weight": 51.915353562320064,
113
  "children": [
114
  {
115
  "kind": "widget",
@@ -120,16 +96,38 @@
120
  "embedding_reduced"
121
  ],
122
  "reductionMethod": null,
123
- "colorBy": "fine_label_str",
124
  "sizeBy": "is_label_issue",
125
- "filter": true,
126
  "umapNNeighbors": 20,
127
  "umapMetric": "cosine",
128
  "umapMinDist": 0.15,
129
  "pcaNormalization": null,
130
- "umapMenuLocalGlobalBalance": 0.5,
131
  "umapMenuIsAdvanced": false
132
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  }
134
  ]
135
  }
 
3
  "children": [
4
  {
5
  "kind": "split",
6
+ "weight": 44.24966799468792,
7
  "orientation": "vertical",
8
  "children": [
9
  {
10
  "kind": "tab",
11
+ "weight": 33.54784241752236,
12
  "children": [
13
  {
14
  "kind": "widget",
 
17
  "config": {
18
  "tableView": "full",
19
  "visibleColumns": [
20
+ "label",
 
21
  "label_score",
22
  "near_duplicate_score",
23
+ "pred",
24
  "outlier_score"
25
  ],
26
  "sorting": null,
 
31
  },
32
  {
33
  "kind": "tab",
34
+ "weight": 23.686809949341544,
35
  "children": [
36
  {
37
  "kind": "widget",
 
43
  },
44
  {
45
  "kind": "tab",
46
+ "weight": 42.765347633136095,
47
  "children": [
48
  {
49
  "kind": "widget",
 
54
  {
55
  "view": "ImageView",
56
  "columns": [
57
+ "full_image"
58
  ],
59
+ "name": "full_image",
60
+ "key": "7hA9fgoBXsKTCCFVYZfhRb"
61
  },
62
  {
63
  "view": "TextLens",
64
  "columns": [
65
+ "label"
66
  ],
67
  "name": "view",
68
+ "key": "a7fedf96-f36e-4836-9ffe-7b249c16db46"
69
  },
70
  {
71
  "view": "TextLens",
72
  "columns": [
73
+ "pred"
74
  ],
75
  "name": "view",
76
+ "key": "527a66c2-b3d4-4be0-9879-8749ee4fd0ed"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  }
78
  ],
79
  "visibleColumns": 8
 
85
  },
86
  {
87
  "kind": "tab",
88
+ "weight": 55.75033200531208,
89
  "children": [
90
  {
91
  "kind": "widget",
 
96
  "embedding_reduced"
97
  ],
98
  "reductionMethod": null,
99
+ "colorBy": "label",
100
  "sizeBy": "is_label_issue",
101
+ "filter": false,
102
  "umapNNeighbors": 20,
103
  "umapMetric": "cosine",
104
  "umapMinDist": 0.15,
105
  "pcaNormalization": null,
106
+ "umapMenuLocalGlobalBalance": null,
107
  "umapMenuIsAdvanced": false
108
  }
109
+ },
110
+ {
111
+ "kind": "widget",
112
+ "name": "Scatter Plot",
113
+ "type": "scatterplot",
114
+ "config": {
115
+ "xAxisColumn": null,
116
+ "yAxisColumn": null,
117
+ "colorBy": null,
118
+ "sizeBy": null,
119
+ "filter": false
120
+ }
121
+ },
122
+ {
123
+ "kind": "widget",
124
+ "name": "Histogram",
125
+ "type": "histogram",
126
+ "config": {
127
+ "columnKey": null,
128
+ "stackByColumnKey": null,
129
+ "filter": false
130
+ }
131
  }
132
  ]
133
  }
run.py CHANGED
@@ -17,26 +17,47 @@ if __name__ == "__main__":
17
  df = pickle.load(file)
18
  print("Dataset loaded from cache.")
19
 
 
20
  label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
21
- label_issue = DataIssue(severity="medium", title="label-issue", rows=label_issue_rows, description="Label issue found by cleanlab")
 
 
 
 
 
22
 
23
- outlier_issue_row = df[df["outlier_score"]<0.6].sort_values("outlier_score").index.tolist()
24
- outlier_issue = DataIssue(severity="medium", title="outlier-issue", rows=outlier_issue_row, description="Outlier score < 0.6")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- near_duplicate_issue_row = df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
27
- near_duplicate_issue = DataIssue(severity="medium", title="near-duplicate-issue", rows=near_duplicate_issue_row, description="Near duplicate issue found by cleanlab")
28
 
29
- df = df.drop(["full_image"], axis=1)
30
  while True:
31
  dtypes = {
32
  "image": spotlight.Image,
 
33
  "embedding": spotlight.Embedding,
34
  "embedding_reduced": spotlight.Embedding,
35
  "probabilities": spotlight.Embedding,
36
  }
37
 
38
  view = spotlight.show(
39
- df,
40
  dtype=dtypes,
41
  issues=[label_issue,outlier_issue,near_duplicate_issue],
42
  layout="layout.json",
 
17
  df = pickle.load(file)
18
  print("Dataset loaded from cache.")
19
 
20
+
21
  label_issue_rows = df[df["is_label_issue"]].sort_values("label_score").index.tolist()
22
+ label_issue = DataIssue(
23
+ severity="medium",
24
+ title="label-issue",
25
+ rows=label_issue_rows,
26
+ description="Label issue found by cleanlab - Review and correct if necessary",
27
+ )
28
 
29
+ outlier_issue_row = (
30
+ df[df["outlier_score"] < 0.6].sort_values("outlier_score").index.tolist()
31
+ )
32
+ outlier_issue = DataIssue(
33
+ severity="medium",
34
+ title="outlier-issue",
35
+ rows=outlier_issue_row,
36
+ description="Outlier score < 0.6 - Review and remove or collect more data",
37
+ )
38
+
39
+ near_duplicate_issue_row = (
40
+ df[df["is_near_duplicate_issue"]].sort_values("near_duplicate_score").index.tolist()
41
+ )
42
+ near_duplicate_issue = DataIssue(
43
+ severity="medium",
44
+ title="near-duplicate-issue",
45
+ rows=near_duplicate_issue_row,
46
+ description="Near duplicate issue found by cleanlab - Review and remove if necessary",
47
+ )
48
 
 
 
49
 
 
50
  while True:
51
  dtypes = {
52
  "image": spotlight.Image,
53
+ "image_full": spotlight.Image,
54
  "embedding": spotlight.Embedding,
55
  "embedding_reduced": spotlight.Embedding,
56
  "probabilities": spotlight.Embedding,
57
  }
58
 
59
  view = spotlight.show(
60
+ df.rename(columns={"fine_label_str": "label", "fine_label_prediction_str":"pred"}),
61
  dtype=dtypes,
62
  issues=[label_issue,outlier_issue,near_duplicate_issue],
63
  layout="layout.json",