SaulLu committed on
Commit
a06494a
1 Parent(s): ac7a630

initialize app

Browse files
Files changed (2) hide show
  1. app.py +295 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pprint as pp
3
+ from collections import OrderedDict, defaultdict
4
+
5
+ import diff_viewer
6
+ import pandas as pd
7
+ import streamlit as st
8
+ from datasets import load_from_disk
9
+
10
# Root directory listed below to obtain the available cleaning versions; read
# from the environment (None when the variable is unset).
DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = os.environ.get(
    "DATASET_DIR_PATH_BEFORE_CLEAN_SELECT"
)

# Markers that introduce a new operation in the cleaning logs.
OPERATION_TYPES = [
    "Applied filter",
    "Applied deduplication function",
    "Applied map function",
]

# Raw environment value: a string, or None when the variable is unset.
MAX_LEN_DS_CHECKS = os.environ.get("MAX_LEN_DS_CHECKS")
17
+
18
+
19
def get_ds(ds_path):
    """Load the dataset saved on disk at ``ds_path`` and return it."""
    return load_from_disk(ds_path)
22
+
23
+
24
def next_idx(idx: int):
    """Return the index after ``idx``, wrapping around the dataset length.

    The length is taken from the dataset stored in ``st.session_state["ds"]``.
    """
    return (idx + 1) % len(st.session_state["ds"])
27
+
28
+
29
def previous_idx(idx: int):
    """Return the index before ``idx``, wrapping around the dataset length.

    The length is taken from the dataset stored in ``st.session_state["ds"]``.
    """
    return (idx - 1) % len(st.session_state["ds"])
32
+
33
+
34
def on_click_next():
    """Move both displayed example indices one step forward."""
    for state_key in ("idx_1", "idx_2"):
        st.session_state[state_key] = next_idx(st.session_state[state_key])
37
+
38
+
39
def on_click_previous():
    """Move both displayed example indices one step backward."""
    for state_key in ("idx_1", "idx_2"):
        st.session_state[state_key] = previous_idx(st.session_state[state_key])
42
+
43
+
44
def on_ds_change(ds_path):
    """Load the dataset at ``ds_path`` and reset the session state for it.

    Stores the dataset, its path and its length in ``st.session_state`` and
    points the two displayed indices at the first two examples (both at 0 when
    the dataset has at most one example).
    """
    dataset = get_ds(ds_path)
    st.session_state["ds"] = dataset
    n_docs = len(dataset)
    st.session_state["idx_1"] = 0
    st.session_state["idx_2"] = int(n_docs > 1)
    st.session_state["ds_name"] = ds_path
    st.session_state["ds_max_docs"] = n_docs
50
+
51
+
52
def get_log_stats_df(raw_log):
    """Parse a cleaning log into a DataFrame with one row per operation.

    A line containing one of ``OPERATION_TYPES`` starts a new operation; the
    following lines containing a metric name ("Initial number of samples",
    "Final number of samples", "Initial size in bytes", "Final size in bytes")
    are attached to that operation. In both cases the value is the second
    space-separated token found after the matched text, e.g.
    ``"Applied filter: my_filter"`` yields ``"my_filter"``.

    Args:
        raw_log: Full content of the log file, as a single string.

    Returns:
        A ``pandas.DataFrame`` with the columns "Order", "Name", the two
        sample counts, "Initial size (GB)", "Final size (GB)" and the two
        derived columns "% samples removed" and "Size (GB) % removed".

    Raises:
        ValueError: If a metric appears before any operation, appears twice
            for the same operation, or has no value after its name.
        KeyError: If an operation is missing one of the expected metrics.
    """
    columns = (
        "Order",
        "Name",
        "Initial number of samples",
        "Final number of samples",
        "Initial size in bytes",
        "Final size in bytes",
    )
    # "Order" and "Name" are derived from the operation lines, not parsed as
    # metrics. Metrics are searched before operation markers on each line.
    searched_names = [c for c in columns if c not in ("Order", "Name")]
    searched_names += list(OPERATION_TYPES)

    metric_dict = defaultdict(dict)
    operation_name = None
    order = 0
    for line in raw_log.split("\n"):
        for metric_name in searched_names:
            if metric_name not in line:
                continue

            tokens = line.split(metric_name)[1].split(" ")
            if len(tokens) < 2:
                raise ValueError(
                    f"No value found after {metric_name!r} in line: {line}"
                )
            value = tokens[1]

            if metric_name in OPERATION_TYPES:
                operation_name = value
                metric_dict[operation_name]["Order"] = order
                order += 1
                continue

            if operation_name is None:
                raise ValueError(
                    f"Metric {metric_name!r} found before any operation in line: {line}"
                )

            # Explicit check instead of `assert`, which is stripped under -O.
            if metric_name in metric_dict[operation_name]:
                raise ValueError(
                    f"operation_name: {operation_name}\n\nvalue: {value}\n\nmetric_dict: {pp.pformat(metric_dict)} \n\nmetric_name: {metric_name} \n\nline: {line}"
                )
            metric_dict[operation_name][metric_name] = value

    data = OrderedDict((column, []) for column in columns)
    for name, data_dict in metric_dict.items():
        for column in columns:
            data[column].append(name if column == "Name" else data_dict[column])

    df = pd.DataFrame(data)
    # NOTE(review): the size columns are only relabelled, not converted;
    # presumably the log already reports sizes in GB -- confirm.
    df = df.rename(
        columns={
            "Initial size in bytes": "Initial size (GB)",
            "Final size in bytes": "Final size (GB)",
        }
    )

    initial_samples = df["Initial number of samples"].astype(float)
    final_samples = df["Final number of samples"].astype(float)
    df["% samples removed"] = (initial_samples - final_samples) / initial_samples * 100

    initial_size = df["Initial size (GB)"].astype(float)
    final_size = df["Final size (GB)"].astype(float)
    df["Size (GB) % removed"] = (initial_size - final_size) / initial_size * 100
    return df
129
+
130
+
131
def get_logs_stats(log_path):
    """Display the statistics parsed from the log file at ``log_path``.

    The parsed statistics are shown as a dataframe. If parsing raises, the
    exception is displayed instead, followed by the log lines that contain
    "INFO - __main__" and neither "Examples of" nor "Examples n°".
    """
    with open(log_path) as log_file:
        raw_log = log_file.read()

    try:
        st.dataframe(get_log_stats_df(raw_log))
    except Exception as err:
        # Fallback display: show the error and a filtered view of the log.
        excluded_markers = ("Examples of", "Examples n°")
        st.write(err)
        st.write("Subset of the logs:")
        kept_lines = []
        for log_line in raw_log.split("\n"):
            if "INFO - __main__" not in log_line:
                continue
            if any(marker in log_line for marker in excluded_markers):
                continue
            kept_lines.append(log_line)
        st.write(kept_lines)
149
+
150
+
151
def meta_component(idx_key: str = "idx_1"):
    """Show the ``meta`` field of the selected example inside an expander.

    Args:
        idx_key: Key of ``st.session_state`` holding the index of the example
            whose metadata is displayed.

    Nothing is rendered when the example has no ``meta`` field.
    """
    example = st.session_state["ds"][st.session_state[idx_key]]
    if "meta" not in example:
        return

    with st.expander("See meta field of the example"):
        # Use the example designated by ``idx_key``: reading the one at
        # "idx_1" unconditionally would show the wrong metadata whenever
        # another key is passed.
        st.write(example["meta"])
158
+
159
+
160
def filter_page():
    """Render the page showing two consecutive filtered-out examples.

    The user picks an index with a number input; that example and the next
    one (wrapping around) are displayed side by side, each with its optional
    ``meta`` field and its text.
    """
    import html  # local import: only needed to escape texts before rendering

    index_example = st.number_input(
        "Index of the chosen example",
        min_value=0,
        max_value=st.session_state["ds_max_docs"] - 1,
        value=0,
        step=1,
    )
    st.session_state["idx_1"] = index_example
    st.session_state["idx_2"] = next_idx(index_example)
    idx_1 = st.session_state["idx_1"]
    idx_2 = st.session_state["idx_2"]
    text_1 = st.session_state["ds"][idx_1]["text"]
    text_2 = st.session_state["ds"][idx_2]["text"]

    st.markdown(
        "<h1 style='text-align: center'>Some examples of filtered out texts</h1>",
        unsafe_allow_html=True,
    )
    # The previous/next buttons (on_click_previous / on_click_next) are not
    # rendered; navigation goes through the number input above.

    col_1, col_2 = st.columns(2)
    for column, idx_key, idx, text in (
        (col_1, "idx_1", idx_1, text_1),
        (col_2, "idx_2", idx_2, text_2),
    ):
        with column:
            st.subheader(f"Example n°{idx}")
            meta_component(idx_key=idx_key)
            # Escape the text so that markup it contains is displayed
            # literally instead of being interpreted as HTML; only the
            # line breaks are turned into tags.
            text_show = html.escape(text).replace("\n", "<br>")
            st.markdown(f"<div>{text_show}</div>", unsafe_allow_html=True)
204
+
205
+
206
def dedup_or_cleaning_page():
    """Render the page showing how one example was changed by an operation.

    The user picks an index with a number input; the diff between the
    ``old_text`` and ``text`` fields of that example is displayed, followed by
    its optional ``meta`` field and an expander with both full texts.
    """
    import html  # local import: only needed to escape texts before rendering

    index_example = st.number_input(
        "Index of the chosen example",
        min_value=0,
        max_value=st.session_state["ds_max_docs"] - 1,
        value=0,
        step=1,
    )
    st.session_state["idx_1"] = index_example
    st.session_state["idx_2"] = next_idx(index_example)

    # The previous/next buttons (on_click_previous / on_click_next) are not
    # rendered; navigation goes through the number input above.

    example = st.session_state["ds"][st.session_state["idx_1"]]
    text = example["text"]
    old_text = example["old_text"]
    st.markdown(
        # The heading is closed with </h2> to match its opening tag.
        "<h2 style='text-align: center'>Changes applied</h2>",
        unsafe_allow_html=True,
    )
    col_text_1, col_text_2 = st.columns(2)
    with col_text_1:
        st.subheader("Old text")
    with col_text_2:
        st.subheader("New text")
    diff_viewer.diff_viewer(old_text=old_text, new_text=text, lang="none")
    meta_component(idx_key="idx_1")

    with st.expander("See full old and new texts of the example"):
        # Escape the texts so that markup they contain is displayed literally
        # instead of being interpreted as HTML; only the line breaks are
        # turned into tags.
        text_show = html.escape(text).replace("\n", "<br>")
        old_text_show = html.escape(old_text).replace("\n", "<br>")

        col_1, col_2 = st.columns(2)
        with col_1:
            st.subheader("Old text")
            st.markdown(f"<div>{old_text_show}</div>", unsafe_allow_html=True)
        with col_2:
            st.subheader("New text")
            st.markdown(f"<div>{text_show}</div>", unsafe_allow_html=True)
257
+
258
+
259
# Streamlit page
st.set_page_config(page_title="Dataset explorer", page_icon=":hugging_face:", layout="wide")
st.write(
    "The purpose of this application is to sequentially view the changes made to a dataset."
)
col_option_clean, col_option_ds = st.columns(2)

# Cleaning versions are listed in reverse lexicographic order.
CLEANING_VERSIONS = sorted(os.listdir(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT), reverse=True)
option_clean = col_option_clean.selectbox(
    "Select the cleaning version", CLEANING_VERSIONS
)

DATASET_DIR_PATH = os.path.join(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, option_clean)
dataset_names = sorted(os.listdir(DATASET_DIR_PATH))
option_ds = col_option_ds.selectbox("Select the dataset", dataset_names)

# Each dataset directory holds a "checks" sub-directory (one entry per
# inspectable operation) and a "logs.txt" file.
checks_path = os.path.join(DATASET_DIR_PATH, option_ds, "checks")
checks_names = sorted(os.listdir(checks_path))

log_path = os.path.join(DATASET_DIR_PATH, option_ds, "logs.txt")
get_logs_stats(log_path=log_path)

option_check = st.selectbox("Select the operation applied to inspect", checks_names)
ds_path = os.path.join(checks_path, option_check)

# (Re)load the dataset only when the selection changed.
if "ds" not in st.session_state or ds_path != st.session_state["ds_name"]:
    on_ds_change(ds_path)

# MAX_LEN_DS_CHECKS comes from the environment as a string (or None when
# unset). Convert it before comparing with the dataset length: an int/str
# comparison is always False, so the warning would never be displayed.
try:
    max_len_ds_checks = int(MAX_LEN_DS_CHECKS)
except (TypeError, ValueError):
    # Unset or non-numeric value: no subset size to compare against.
    max_len_ds_checks = None

if max_len_ds_checks is not None and len(st.session_state["ds"]) == max_len_ds_checks:
    st.warning(
        f"Note: only a subset of size {MAX_LEN_DS_CHECKS} of the modified / filtered examples can be shown in this application"
    )
with st.expander("See details of the available checks"):
    st.write(st.session_state["ds"])


# Checks whose name contains "_filter_" get the filter page; all the others
# get the diff page.
if "_filter_" in option_check:
    filter_page()
else:
    dedup_or_cleaning_page()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ datasets==1.17.0
2
+ pandas==1.3.5
3
+ streamlit_diff_viewer==0.0.2