Pietro Lesci committed on
Commit
ca663e1
1 Parent(s): 8400e75

improve UI

Browse files
Files changed (3) hide show
  1. app.py +13 -5
  2. src/components.py +92 -6
  3. src/utils.py +17 -4
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import streamlit as st
2
 
3
- from src.components import faq, footer, form, presentation
4
  from src.utils import convert_df, get_logo, read_file
5
 
6
  # app configs
@@ -41,9 +41,18 @@ if not uploaded_fl:
41
  faq()
42
  else:
43
  df = read_file(uploaded_fl)
44
- new_df = form(df)
45
- if new_df is not None:
46
- payload = convert_df(new_df)
 
 
 
 
 
 
 
 
 
47
  st.download_button(
48
  label="Download data as CSV",
49
  data=payload,
@@ -51,6 +60,5 @@ else:
51
  mime="text/csv",
52
  )
53
 
54
-
55
  # footer
56
  footer()
 
1
  import streamlit as st
2
 
3
+ from src.components import faq, footer, form, presentation, analysis
4
  from src.utils import convert_df, get_logo, read_file
5
 
6
  # app configs
 
41
  faq()
42
  else:
43
  df = read_file(uploaded_fl)
44
+ outputs = form(df)
45
+
46
+ # change or create session state
47
+ if outputs is not None or "outputs" not in st.session_state:
48
+ st.session_state["outputs"] = outputs
49
+
50
+ # when procedure is performed
51
+ if st.session_state["outputs"] is not None:
52
+
53
+ df = analysis(st.session_state["outputs"])
54
+
55
+ payload = convert_df(df)
56
  st.download_button(
57
  label="Download data as CSV",
58
  data=payload,
 
60
  mime="text/csv",
61
  )
62
 
 
63
  # footer
64
  footer()
src/components.py CHANGED
@@ -1,4 +1,6 @@
1
  import streamlit as st
 
 
2
 
3
  from src.configs import Languages, PreprocessingConfigs, SupportedFiles
4
  from src.preprocessing import PreprocessingPipeline
@@ -7,6 +9,7 @@ from src.utils import get_col_indices
7
 
8
 
9
  def form(df):
 
10
  with st.form("Wordify form"):
11
  col1, col2, col3 = st.columns(3)
12
  cols = [""] + df.columns.tolist()
@@ -43,12 +46,16 @@ def form(df):
43
  pre_steps = st.multiselect(
44
  "Select pre-lemmatization processing steps (ordered)",
45
  options=steps_options,
46
- default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
 
 
47
  format_func=lambda x: x.replace("_", " ").title(),
48
  help="Select the processing steps to apply before the text is lemmatized",
49
  )
50
 
51
- lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
 
 
52
  lemmatization_step = st.selectbox(
53
  "Select lemmatization",
54
  options=lammatization_options,
@@ -59,7 +66,10 @@ def form(df):
59
  post_steps = st.multiselect(
60
  "Select post-lemmatization processing steps (ordered)",
61
  options=steps_options,
62
- default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
 
 
 
63
  format_func=lambda x: x.replace("_", " ").title(),
64
  help="Select the processing steps to apply after the text is lemmatized",
65
  )
@@ -68,12 +78,21 @@ def form(df):
68
  submitted = st.form_submit_button("Submit")
69
  if submitted:
70
 
 
 
71
  # preprocess
72
  if not disable_preprocessing:
73
  with st.spinner("Step 1/4: Preprocessing text"):
74
- pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
 
 
75
  df = pipe.vaex_process(df, text_column)
76
-
 
 
 
 
 
77
  # prepare input
78
  with st.spinner("Step 2/4: Preparing inputs"):
79
  input_dict = input_transform(df[text_column], df[label_column])
@@ -86,7 +105,19 @@ def form(df):
86
  with st.spinner("Step 4/4: Preparing outputs"):
87
  new_df = output_transform(pos, neg)
88
 
89
- return new_df
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
  def faq():
@@ -274,3 +305,58 @@ def contacts():
274
 
275
  <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
276
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import time
3
+ import pandas as pd
4
 
5
  from src.configs import Languages, PreprocessingConfigs, SupportedFiles
6
  from src.preprocessing import PreprocessingPipeline
 
9
 
10
 
11
  def form(df):
12
+ st.subheader("Parameters")
13
  with st.form("Wordify form"):
14
  col1, col2, col3 = st.columns(3)
15
  cols = [""] + df.columns.tolist()
 
46
  pre_steps = st.multiselect(
47
  "Select pre-lemmatization processing steps (ordered)",
48
  options=steps_options,
49
+ default=[
50
+ steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
51
+ ],
52
  format_func=lambda x: x.replace("_", " ").title(),
53
  help="Select the processing steps to apply before the text is lemmatized",
54
  )
55
 
56
+ lammatization_options = list(
57
+ PreprocessingPipeline.lemmatization_component().keys()
58
+ )
59
  lemmatization_step = st.selectbox(
60
  "Select lemmatization",
61
  options=lammatization_options,
 
66
  post_steps = st.multiselect(
67
  "Select post-lemmatization processing steps (ordered)",
68
  options=steps_options,
69
+ default=[
70
+ steps_options[i]
71
+ for i in PreprocessingConfigs.DEFAULT_POST.value
72
+ ],
73
  format_func=lambda x: x.replace("_", " ").title(),
74
  help="Select the processing steps to apply after the text is lemmatized",
75
  )
 
78
  submitted = st.form_submit_button("Submit")
79
  if submitted:
80
 
81
+ start_time = time.time()
82
+
83
  # preprocess
84
  if not disable_preprocessing:
85
  with st.spinner("Step 1/4: Preprocessing text"):
86
+ pipe = PreprocessingPipeline(
87
+ language, pre_steps, lemmatization_step, post_steps
88
+ )
89
  df = pipe.vaex_process(df, text_column)
90
+ else:
91
+ with st.spinner(
92
+ "Step 1/4: Preprocessing has been disabled - doing nothing"
93
+ ):
94
+ time.sleep(1.5)
95
+
96
  # prepare input
97
  with st.spinner("Step 2/4: Preparing inputs"):
98
  input_dict = input_transform(df[text_column], df[label_column])
 
105
  with st.spinner("Step 4/4: Preparing outputs"):
106
  new_df = output_transform(pos, neg)
107
 
108
+ # reset the index for the UI
109
+ new_df = new_df.reset_index(drop=True)
110
+
111
+ end_time = time.time()
112
+ meta_data = {
113
+ "vocab_size": input_dict["X"].shape[1],
114
+ "n_instances": input_dict["X"].shape[0],
115
+ "vocabulary": pd.DataFrame({"Vocabulary": input_dict["X_names"]}),
116
+ "labels": pd.DataFrame({"Labels": input_dict["y_names"]}),
117
+ "time": round(end_time - start_time),
118
+ }
119
+
120
+ return new_df, meta_data
121
 
122
 
123
  def faq():
 
305
 
306
  <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
307
  """
308
+
309
+
310
+ def analysis(outputs):
311
+
312
+ df, meta_data = outputs
313
+
314
+ st.subheader("Results")
315
+ st.markdown(
316
+ """
317
+ Wordify successfully run and you can now look at the results before downloading the wordified file.
318
+ In particular, you can use the slider to filter only those words that have a `Score` above (>=) a certain threshold.
319
+ For meaningful results, we suggest keeping the threshold to 0.25.
320
+ """
321
+ )
322
+
323
+ col1, col2 = st.columns([2, 1])
324
+
325
+ with col1:
326
+ threshold = st.slider(
327
+ "Select threshold",
328
+ min_value=0.0,
329
+ max_value=1.0,
330
+ step=0.01,
331
+ value=0.25,
332
+ help="To return everything, select 0.",
333
+ )
334
+ subset_df = df.loc[df["Score"] >= threshold]
335
+ st.write(subset_df)
336
+
337
+ with col2:
338
+ st.markdown("**Some info about your data**")
339
+ st.markdown(
340
+ f"""
341
+ Your input file contained {meta_data["n_instances"]:,} rows and
342
+ Wordify took {meta_data["time"]:,} seconds to run.
343
+
344
+ The total number of n-grams Wordify considered is {meta_data["vocab_size"]:,}.
345
+ With the current selected threshold on the `Score` (>={threshold}) the output contains {subset_df["Word"].nunique():,}
346
+ unique n-grams.
347
+ """
348
+ )
349
+
350
+ with st.expander("Vocabulary"):
351
+ st.markdown(
352
+ "The table below shows all candidate n-grams that Wordify considered"
353
+ )
354
+ st.write(meta_data["vocabulary"])
355
+
356
+ with st.expander("Labels"):
357
+ st.markdown(
358
+ "The table below summarizes the labels that your file contained"
359
+ )
360
+ st.write(meta_data["labels"])
361
+
362
+ return subset_df
src/utils.py CHANGED
@@ -68,7 +68,12 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
68
 
69
  return
70
 
71
- source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
 
 
 
 
 
72
  source["Props"] = source["Counts"] / source["Counts"].sum()
73
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
74
 
@@ -81,7 +86,9 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
81
  )
82
  )
83
 
84
- text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
 
 
85
 
86
  return (bars + text).properties(height=300)
87
 
@@ -93,7 +100,9 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
93
  alt.Chart(source)
94
  .mark_bar()
95
  .encode(
96
- alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
 
 
97
  alt.Y("count()", axis=alt.Axis(title="")),
98
  )
99
  )
@@ -103,7 +112,11 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
103
 
104
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
105
 
106
- source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
 
 
 
 
107
 
108
  plot = (
109
  alt.Chart(source)
 
68
 
69
  return
70
 
71
+ source = (
72
+ data[label_column]
73
+ .value_counts()
74
+ .reset_index()
75
+ .rename(columns={"index": "Labels", label_column: "Counts"})
76
+ )
77
  source["Props"] = source["Counts"] / source["Counts"].sum()
78
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
79
 
 
86
  )
87
  )
88
 
89
+ text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
90
+ text="Proportions:O"
91
+ )
92
 
93
  return (bars + text).properties(height=300)
94
 
 
100
  alt.Chart(source)
101
  .mark_bar()
102
  .encode(
103
+ alt.X(
104
+ f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
105
+ ),
106
  alt.Y("count()", axis=alt.Axis(title="")),
107
  )
108
  )
 
112
 
113
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
114
 
115
+ source = (
116
+ data.loc[data[label_col] == label]
117
+ .sort_values("score", ascending=False)
118
+ .head(100)
119
+ )
120
 
121
  plot = (
122
  alt.Chart(source)