SaulLu committed
Commit b7eb9ad
1 Parent(s): a06494a

update app

Files changed (2)
  1. app.py +35 -21
  2. requirements.txt +1 -1
app.py CHANGED
@@ -5,9 +5,11 @@ from collections import OrderedDict, defaultdict
 import diff_viewer
 import pandas as pd
 import streamlit as st
-from datasets import load_from_disk
+from datasets import load_dataset, get_dataset_config_names
+
+CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = os.getenv("CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT")
+LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = os.getenv("LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT")
 
-DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = os.getenv("DATASET_DIR_PATH_BEFORE_CLEAN_SELECT")
 OPERATION_TYPES = [
     "Applied filter",
     "Applied deduplication function",
@@ -16,8 +18,8 @@ OPERATION_TYPES = [
 MAX_LEN_DS_CHECKS = os.getenv("MAX_LEN_DS_CHECKS")
 
 
-def get_ds(ds_path):
-    ds = load_from_disk(ds_path)
+def get_ds(config):
+    ds = load_dataset(CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, config)
     return ds
 
 
@@ -41,11 +43,11 @@ def on_click_previous():
     st.session_state["idx_2"] = previous_idx(st.session_state["idx_2"])
 
 
-def on_ds_change(ds_path):
-    st.session_state["ds"] = get_ds(ds_path)
+def on_ds_change(config):
+    st.session_state["ds"] = get_ds(config)
     st.session_state["idx_1"] = 0
     st.session_state["idx_2"] = 1 if len(st.session_state["ds"]) > 1 else 0
-    st.session_state["ds_name"] = ds_path
+    st.session_state["ds_check_config"] = config
     st.session_state["ds_max_docs"] = len(st.session_state["ds"])
 
 
@@ -128,10 +130,7 @@ def get_log_stats_df(raw_log):
     return df
 
 
-def get_logs_stats(log_path):
-    with open(log_path) as f:
-        raw_log = f.read()
-
+def get_logs_stats(raw_log):
     try:
         df = get_log_stats_df(raw_log)
         st.dataframe(df)
@@ -263,26 +262,41 @@ st.write(
 )
 col_option_clean, col_option_ds = st.columns(2)
 
-CLEANING_VERSIONS = sorted(list(os.listdir(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT)), reverse=True)
+CHECK_CONFIGS = get_dataset_config_names(CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT)
+
+CLEANING_VERSIONS = set()
+dataset_names = set()
+checks_names = set()
+for check_config in CHECK_CONFIGS:
+    cleaning_version, check_config = check_config.split("_dsname_")
+    dataset_name, checks_name = check_config.split("_operation_")
+    CLEANING_VERSIONS.add(cleaning_version)
+    dataset_names.add(dataset_name)
+    checks_names.add(checks_name)
+
+# CLEANING_VERSIONS = sorted(list(os.listdir(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT)), reverse=True)
 option_clean = col_option_clean.selectbox(
     "Select the cleaning version", CLEANING_VERSIONS
 )
 
-DATASET_DIR_PATH = os.path.join(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, option_clean)
-dataset_names = sorted(list(os.listdir(DATASET_DIR_PATH)))
+# DATASET_DIR_PATH = os.path.join(DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, option_clean)
+# dataset_names = sorted(list(os.listdir(DATASET_DIR_PATH)))
 option_ds = col_option_ds.selectbox("Select the dataset", dataset_names)
 
-checks_path = os.path.join(DATASET_DIR_PATH, option_ds, "checks")
-checks_names = sorted(list(os.listdir(checks_path)))
+# checks_path = os.path.join(DATASET_DIR_PATH, option_ds, "checks")
+# checks_names = sorted(list(os.listdir(checks_path)))
 
-log_path = os.path.join(DATASET_DIR_PATH, option_ds, "logs.txt")
-get_logs_stats(log_path=log_path)
+# log_path = os.path.join(DATASET_DIR_PATH, option_ds, "logs.txt")
+ds_log = load_dataset(LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT, f"{option_clean}_dsname_{option_ds}")
+log = ds_log["train"][0]
+get_logs_stats(raw_log=log)
 
 option_check = st.selectbox("Select the operation applied to inspect", checks_names)
-ds_path = os.path.join(checks_path, option_check)
 
-if "ds" not in st.session_state or ds_path != st.session_state["ds_name"]:
-    on_ds_change(ds_path)
+ds_check_config = f"{option_clean}_dsname_{option_ds}_operation_{option_check}"
+
+if "ds" not in st.session_state or ds_check_config != st.session_state["ds_check_config"]:
+    on_ds_change(ds_check_config)
 
 if len(st.session_state["ds"]) == MAX_LEN_DS_CHECKS:
     st.warning(
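
The selection widgets above no longer walk a local directory tree; they derive their options from Hub dataset config names that pack the cleaning version, dataset name, and operation into one string using the "_dsname_" and "_operation_" separators. A minimal sketch of that naming round trip, with made-up values and helper names that are not part of app.py:

def build_check_config(cleaning_version, dataset_name, operation):
    # Hypothetical helper mirroring the naming scheme used in app.py.
    return f"{cleaning_version}_dsname_{dataset_name}_operation_{operation}"

def parse_check_config(check_config):
    # Mirrors the parsing in the for-loop over CHECK_CONFIGS above.
    cleaning_version, rest = check_config.split("_dsname_")
    dataset_name, operation = rest.split("_operation_")
    return cleaning_version, dataset_name, operation

# Example with made-up values:
config = build_check_config("v2", "oscar_fr", "filter")
assert parse_check_config(config) == ("v2", "oscar_fr", "filter")
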
requirements.txt CHANGED
@@ -1,3 +1,3 @@
-datasets==1.17.0
+datasets==2.3.2
 pandas==1.3.5
 streamlit_diff_viewer==0.0.2
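
The bump from datasets 1.17.0 to 2.3.2 goes with app.py switching from load_from_disk on local folders to Hub-hosted dataset repositories. A small sketch of the two calls the app now relies on; the repository id below is a placeholder, in the app it comes from the CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT environment variable:

from datasets import get_dataset_config_names, load_dataset

REPO_ID = "org/checks-dataset"  # placeholder repository id

configs = get_dataset_config_names(REPO_ID)  # one config per (cleaning version, dataset, operation)
ds = load_dataset(REPO_ID, configs[0])       # DatasetDict, typically with a "train" split
print(ds["train"][0])                        # first record of the selected check
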