Pietro Lesci committed on
Commit
e48d543
1 Parent(s): b748dad

format and delete old

Browse files
app.py CHANGED
@@ -1,12 +1,9 @@
1
  import streamlit as st
2
- from src.utils import get_logo
3
  from src import session_state
4
- from src.pages import (
5
- home,
6
- faq,
7
- about,
8
- )
9
  from src.configs import SupportedFiles
 
 
10
 
11
  # app configs
12
  st.set_page_config(
@@ -59,7 +56,9 @@ st.sidebar.markdown(
59
  """,
60
  unsafe_allow_html=True,
61
  )
62
- st.sidebar.info("Something not working? Consider [filing an issue](https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new)")
 
 
63
 
64
 
65
  # ==== MAIN ==== #
 
1
  import streamlit as st
2
+
3
  from src import session_state
 
 
 
 
 
4
  from src.configs import SupportedFiles
5
+ from src.pages import about, faq, home
6
+ from src.utils import get_logo
7
 
8
  # app configs
9
  st.set_page_config(
 
56
  """,
57
  unsafe_allow_html=True,
58
  )
59
+ st.sidebar.info(
60
+ "Something not working? Consider [filing an issue](https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new)"
61
+ )
62
 
63
 
64
  # ==== MAIN ==== #
main.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
2
- from src.utils import get_logo, read_file, convert_df
3
- from src.components import form, faq, presentation, footer, about
4
 
 
 
5
 
6
  # app configs
7
  st.set_page_config(
@@ -10,10 +10,10 @@ st.set_page_config(
10
  layout="centered",
11
  page_icon="./assets/logo.png",
12
  menu_items={
13
- 'Get Help': "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
14
- 'Report a Bug': "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
15
- 'About': about(),
16
- }
17
  )
18
 
19
  # logo
 
1
  import streamlit as st
 
 
2
 
3
+ from src.components import about, faq, footer, form, presentation
4
+ from src.utils import convert_df, get_logo, read_file
5
 
6
  # app configs
7
  st.set_page_config(
 
10
  layout="centered",
11
  page_icon="./assets/logo.png",
12
  menu_items={
13
+ "Get Help": "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
14
+ "Report a Bug": "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
15
+ "About": about(),
16
+ },
17
  )
18
 
19
  # logo
src/components.py CHANGED
@@ -1,7 +1,8 @@
1
  import streamlit as st
 
 
2
  from src.preprocessing import PreprocessingPipeline
3
- from src.wordifier import input_transform, wordifier, output_transform
4
- from src.configs import PreprocessingConfigs, SupportedFiles, Languages
5
 
6
 
7
  @st.experimental_memo
@@ -12,10 +13,16 @@ def form(df):
12
 
13
  cols = [""] + df.columns.tolist()
14
  label_column = st.selectbox(
15
- "Select label column", cols, index=0, help="Select the column containing the labels"
 
 
 
16
  )
17
  text_column = st.selectbox(
18
- "Select text column", cols, index=0, help="Select the column containing the text"
 
 
 
19
  )
20
  language = st.selectbox(
21
  "Select language",
@@ -31,12 +38,16 @@ def form(df):
31
  pre_steps = st.multiselect(
32
  "Select pre-lemmatization processing steps (ordered)",
33
  options=steps_options,
34
- default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
 
 
35
  format_func=lambda x: x.replace("_", " ").title(),
36
  help="Select the processing steps to apply before the text is lemmatized",
37
  )
38
 
39
- lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
 
 
40
  lemmatization_step = st.selectbox(
41
  "Select lemmatization",
42
  options=lammatization_options,
@@ -47,7 +58,9 @@ def form(df):
47
  post_steps = st.multiselect(
48
  "Select post-lemmatization processing steps (ordered)",
49
  options=steps_options,
50
- default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
 
 
51
  format_func=lambda x: x.replace("_", " ").title(),
52
  help="Select the processing steps to apply after the text is lemmatized",
53
  )
@@ -58,7 +71,9 @@ def form(df):
58
 
59
  # preprocess
60
  with st.spinner("Step 1/4: Preprocessing text"):
61
- pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
 
 
62
  df = pipe.vaex_process(df, text_column)
63
 
64
  # prepare input
@@ -188,7 +203,10 @@ def presentation():
188
  """
189
  )
190
  st.table(
191
- {"text": ["A review", "Another review", "Yet another one", "etc"], "label": ["Good", "Bad", "Good", "etc"]}
 
 
 
192
  )
193
 
194
  st.subheader("Output format")
@@ -226,6 +244,7 @@ def contacts():
226
  <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
227
  """
228
 
 
229
  def about():
230
  return """
231
  The wordify team
 
1
  import streamlit as st
2
+
3
+ from src.configs import Languages, PreprocessingConfigs, SupportedFiles
4
  from src.preprocessing import PreprocessingPipeline
5
+ from src.wordifier import input_transform, output_transform, wordifier
 
6
 
7
 
8
  @st.experimental_memo
 
13
 
14
  cols = [""] + df.columns.tolist()
15
  label_column = st.selectbox(
16
+ "Select label column",
17
+ cols,
18
+ index=0,
19
+ help="Select the column containing the labels",
20
  )
21
  text_column = st.selectbox(
22
+ "Select text column",
23
+ cols,
24
+ index=0,
25
+ help="Select the column containing the text",
26
  )
27
  language = st.selectbox(
28
  "Select language",
 
38
  pre_steps = st.multiselect(
39
  "Select pre-lemmatization processing steps (ordered)",
40
  options=steps_options,
41
+ default=[
42
+ steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
43
+ ],
44
  format_func=lambda x: x.replace("_", " ").title(),
45
  help="Select the processing steps to apply before the text is lemmatized",
46
  )
47
 
48
+ lammatization_options = list(
49
+ PreprocessingPipeline.lemmatization_component().keys()
50
+ )
51
  lemmatization_step = st.selectbox(
52
  "Select lemmatization",
53
  options=lammatization_options,
 
58
  post_steps = st.multiselect(
59
  "Select post-lemmatization processing steps (ordered)",
60
  options=steps_options,
61
+ default=[
62
+ steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value
63
+ ],
64
  format_func=lambda x: x.replace("_", " ").title(),
65
  help="Select the processing steps to apply after the text is lemmatized",
66
  )
 
71
 
72
  # preprocess
73
  with st.spinner("Step 1/4: Preprocessing text"):
74
+ pipe = PreprocessingPipeline(
75
+ language, pre_steps, lemmatization_step, post_steps
76
+ )
77
  df = pipe.vaex_process(df, text_column)
78
 
79
  # prepare input
 
203
  """
204
  )
205
  st.table(
206
+ {
207
+ "text": ["A review", "Another review", "Yet another one", "etc"],
208
+ "label": ["Good", "Bad", "Good", "etc"],
209
+ }
210
  )
211
 
212
  st.subheader("Output format")
 
244
  <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
245
  """
246
 
247
+
248
  def about():
249
  return """
250
  The wordify team
src/configs.py CHANGED
@@ -1,4 +1,5 @@
1
  from enum import Enum
 
2
  import pandas as pd
3
 
4
 
 
1
  from enum import Enum
2
+
3
  import pandas as pd
4
 
5
 
src/pages/about.py DELETED
@@ -1,34 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
- def write(*args):
5
- # ==== Contacts ==== #
6
- with st.beta_container():
7
- st.markdown("")
8
- st.markdown("")
9
- st.header(":rocket:About us")
10
-
11
- st.markdown(
12
- """
13
- You can reach out to us via email, phone, or - if you are old-fashioned - via mail
14
- """
15
- )
16
- with st.beta_expander("Contacts"):
17
-
18
- _, col2 = st.beta_columns([0.5, 3])
19
- col2.markdown(
20
- """
21
- :email: wordify@unibocconi.it
22
-
23
- :telephone_receiver: +39 02 5836 2604
24
-
25
- :postbox: Via Röntgen n. 1, Milan 20136 (ITALY)
26
- """
27
- )
28
-
29
- st.write(
30
- """
31
- <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
32
- """,
33
- unsafe_allow_html=True,
34
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/faq.py DELETED
@@ -1,126 +0,0 @@
1
- import streamlit as st
2
- from src.configs import Languages
3
-
4
-
5
- def write(*args):
6
-
7
- # ==== HOW IT WORKS ==== #
8
- with st.beta_container():
9
- st.markdown("")
10
- st.markdown("")
11
- st.markdown(
12
- """
13
- Wordify makes it easy to identify words that discriminate categories in textual data.
14
-
15
- Let's explain Wordify with an example. Imagine you are thinking about having a glass
16
- of wine :wine_glass: with your friends :man-man-girl-girl: and you have to buy a bottle.
17
- You know you like `bold`, `woody` wine but are unsure which one to choose.
18
- You wonder whether there are some words that describe each type of wine.
19
- Since you are a researcher :female-scientist: :male-scientist:, you decide to approach
20
- the problem scientifically :microscope:. That's where Wordify comes to the rescue!
21
- """
22
- )
23
- st.markdown("")
24
- st.markdown("")
25
- st.header("Steps")
26
- st.subheader("Step 1 - Prepare your data")
27
- st.markdown(
28
- """
29
- Create an Excel or CSV file with two columns for each row:
30
-
31
- - a column with the name or the label identifying a specific object or class (e.g., in our
32
- wine example above it would be the type of wine or the name of a specific brand). It is
33
- common practice naming this column `label`
34
-
35
- - a column with the text describing that specific object or class (e.g., in the wine example
36
- above it could be the description that you find on the rear of the bottle label). It is
37
- common practice naming this column `text`
38
-
39
- To have reliable results, we suggest providing at least 2000 labelled texts. If you provide
40
- less we will still wordify your file, but the results should then be taken with a grain of
41
- salt.
42
-
43
- Consider that we also support multi-language texts, therefore you'll be able to
44
- automatically discriminate between international wines, even if your preferred Italian
45
- producer does not provide you with a description written in English!
46
- """
47
- )
48
-
49
- st.subheader("Step 2 - Upload your file and Wordify!")
50
- st.markdown(
51
- """
52
- Once you have prepared your Excel or CSV file, click the "Browse File" button.
53
- Browse for your file.
54
- Choose the language of your texts (select multi-language if your file contains text in
55
- different languages).
56
- Push the "Wordify|" button, set back, and wait for wordify to do its tricks.
57
-
58
- Depending on the size of your data, the process can take from 1 minute to 5 minutes
59
- """
60
- )
61
-
62
- # ==== FAQ ==== #
63
- with st.beta_container():
64
- st.markdown("")
65
- st.markdown("")
66
- st.header(":question:Frequently Asked Questions")
67
- with st.beta_expander("What is Wordify?"):
68
- st.markdown(
69
- """
70
- Wordify is a way to find out which terms are most indicative for each of your dependent
71
- variable values.
72
- """
73
- )
74
-
75
- with st.beta_expander("What happens to my data?"):
76
- st.markdown(
77
- """
78
- Nothing. We never store the data you upload on disk: it is only kept in memory for the
79
- duration of the modeling, and then deleted. We do not retain any copies or traces of
80
- your data.
81
- """
82
- )
83
-
84
- with st.beta_expander("What input formats do you support?"):
85
- st.markdown(
86
- """
87
- The file you upload should be .xlsx, with two columns: the first should be labeled
88
- 'text' and contain all your documents (e.g., tweets, reviews, patents, etc.), one per
89
- line. The second column should be labeled 'label', and contain the dependent variable
90
- label associated with each text (e.g., rating, author gender, company, etc.).
91
- """
92
- )
93
-
94
- with st.beta_expander("How does it work?"):
95
- st.markdown(
96
- """
97
- It uses a variant of the Stability Selection algorithm
98
- [(Meinshausen and Bühlmann, 2010)](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2010.00740.x)
99
- to fit hundreds of logistic regression models on random subsets of the data, using
100
- different L1 penalties to drive as many of the term coefficients to 0. Any terms that
101
- receive a non-zero coefficient in at least 30% of all model runs can be seen as stable
102
- indicators.
103
- """
104
- )
105
-
106
- with st.beta_expander("How much data do I need?"):
107
- st.markdown(
108
- """
109
- We recommend at least 2000 instances, the more, the better. With fewer instances, the
110
- results are less replicable and reliable.
111
- """
112
- )
113
-
114
- with st.beta_expander("Is there a paper I can cite?"):
115
- st.markdown(
116
- """
117
- Yes please! Reference coming soon...
118
- """
119
- )
120
-
121
- with st.beta_expander("What languages are supported?"):
122
- st.markdown(
123
- f"""
124
- Currently we support: {", ".join([i.name for i in Languages])}.
125
- """
126
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/home.py DELETED
@@ -1,240 +0,0 @@
1
- from src.configs import Languages
2
- from src.utils import read_file, download_button
3
- from src.plotting import plot_labels_prop, plot_nchars, plot_score
4
- from src.preprocessing import Lemmatizer, PreprocessingPipeline, encode
5
- from src.wordifier import wordifier
6
- import streamlit as st
7
-
8
-
9
- def write(session, uploaded_file):
10
-
11
- if not uploaded_file:
12
- st.markdown(
13
- """
14
- Hi, welcome to __Wordify__! :rocket:
15
-
16
- Start by uploading a file - CSV, XLSX (avoid Strict Open XML Spreadsheet format [here](https://stackoverflow.com/questions/62800822/openpyxl-cannot-read-strict-open-xml-spreadsheet-format-userwarning-file-conta)),
17
- or PARQUET are currently supported.
18
-
19
- Once you have uploaded the file, __Wordify__ will show an interactive UI through which
20
- you'll be able to interactively decide the text preprocessing steps, their order, and
21
- proceed to Wordify your text.
22
-
23
- If you're ready, let's jump in:
24
-
25
- :point_left: upload a file via the upload widget in the sidebar!
26
-
27
- NOTE: whenever you want to reset everything, simply refresh the page.
28
- """
29
- )
30
-
31
- elif uploaded_file:
32
-
33
- # ==== 1. READ FILE ==== #
34
- with st.spinner("Reading file"):
35
- # TODO: write parser function that automatically understands format
36
- data = read_file(uploaded_file)
37
-
38
- # 2. CREATE UI TO SELECT COLUMNS
39
- col1, col2, col3 = st.beta_columns(3)
40
- with col1:
41
- language = st.selectbox("Select language", [i.name for i in Languages])
42
- with st.beta_expander("Description"):
43
- st.markdown(
44
- f"Select a language amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}. This will be used to lemmatize and remove stopwords."
45
- )
46
- with col2:
47
- cols_options = [""] + data.columns.tolist()
48
- label_column = st.selectbox(
49
- "Select label column name", cols_options, index=0
50
- )
51
- with st.beta_expander("Description"):
52
- st.markdown("Select the column containing the labels.")
53
-
54
- if label_column:
55
- plot = plot_labels_prop(data, label_column)
56
- if plot:
57
- st.altair_chart(plot, use_container_width=True)
58
-
59
- with col3:
60
- text_column = st.selectbox("Select text column name", cols_options, index=0)
61
- with st.beta_expander("Description"):
62
- st.markdown("Select the column containing the texts.")
63
-
64
- if text_column:
65
- st.altair_chart(
66
- plot_nchars(data, text_column), use_container_width=True
67
- )
68
-
69
- # ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
70
- with st.beta_expander("Advanced options"):
71
-
72
- steps_options = list(PreprocessingPipeline.pipeline_components().keys())
73
-
74
- # stopwords option and
75
- col1, col2 = st.beta_columns([1, 3])
76
- with col1:
77
- st.markdown("Remove stopwords (uses Spacy vocabulary)")
78
- with col2:
79
- remove_stopwords_elem = st.empty()
80
-
81
- # lemmatization option
82
- col1, col2 = st.beta_columns([1, 3])
83
- with col1:
84
- st.markdown("Lemmatizes text (uses Spacy)")
85
- with col2:
86
- lemmatization_elem = st.empty()
87
-
88
- # pre-lemmatization cleaning steps and
89
- # post-lemmatization cleaning steps
90
- col1, col2 = st.beta_columns([1, 3])
91
- with col1:
92
- st.markdown(
93
- f"""
94
- Define a pipeline of cleaning steps that is applied before and/or after lemmatization.
95
- The available cleaning steps are:\n
96
- {", ".join([f"`{x.replace('_', ' ').title()}`" for x in steps_options])}
97
- """
98
- )
99
- with col2:
100
- pre_steps_elem = st.empty()
101
- post_steps_elem = st.empty()
102
- reset_button = st.empty()
103
-
104
- # implement reset logic
105
- if reset_button.button("Reset steps"):
106
- session.run_id += 1
107
-
108
- pre_steps = pre_steps_elem.multiselect(
109
- "Select pre-lemmatization preprocessing steps (ordered)",
110
- options=steps_options,
111
- default=steps_options,
112
- format_func=lambda x: x.replace("_", " ").title(),
113
- key=session.run_id,
114
- )
115
- post_steps = post_steps_elem.multiselect(
116
- "Select post-lemmatization processing steps (ordered)",
117
- options=steps_options,
118
- default=steps_options[-4:],
119
- format_func=lambda x: x.replace("_", " ").title(),
120
- key=session.run_id,
121
- )
122
- remove_stopwords = remove_stopwords_elem.checkbox(
123
- "Remove stopwords",
124
- value=True,
125
- key=session.run_id,
126
- )
127
- lemmatization = lemmatization_elem.checkbox(
128
- "Lemmatize text",
129
- value=True,
130
- key=session.run_id,
131
- )
132
-
133
- # show sample checkbox
134
- col1, col2 = st.beta_columns([1, 2])
135
- with col1:
136
- show_sample = st.checkbox("Show sample of preprocessed text")
137
-
138
- # initialize text preprocessor
139
- preprocessing_pipeline = PreprocessingPipeline(
140
- pre_steps=pre_steps,
141
- lemmatizer=Lemmatizer(
142
- language=language,
143
- remove_stop=remove_stopwords,
144
- lemmatization=lemmatization,
145
- ),
146
- post_steps=post_steps,
147
- )
148
-
149
- print(preprocessing_pipeline.pre_steps)
150
-
151
- # ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
152
- if show_sample and not (label_column and text_column):
153
- st.warning("Please select `label` and `text` columns")
154
-
155
- elif show_sample and (label_column and text_column):
156
- sample_data = data.sample(5)
157
- sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
158
- sample_data[text_column]
159
- ).values
160
-
161
- print(sample_data)
162
- st.table(
163
- sample_data.loc[
164
- :, [label_column, text_column, f"preprocessed_{text_column}"]
165
- ]
166
- )
167
-
168
- # ==== 4. RUN ==== #
169
- run_button = st.button("Wordify!")
170
- if run_button and not (label_column and text_column):
171
- st.warning("Please select `label` and `text` columns")
172
-
173
- elif run_button and (label_column and text_column) and not session.process:
174
-
175
- with st.spinner("Process started"):
176
- # data = data.head()
177
- data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
178
- data[text_column]
179
- ).values
180
-
181
- print(data.head())
182
-
183
- inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
184
- session.posdf, session.negdf = wordifier(**inputs)
185
- st.success("Wordified!")
186
-
187
- # session.posdf, session.negdf = process(data, text_column, label_column)
188
- session.process = True
189
-
190
- # ==== 5. RESULTS ==== #
191
- if session.process and (label_column and text_column):
192
- st.markdown("")
193
- st.markdown("")
194
- st.header("Results")
195
-
196
- # col1, col2, _ = st.beta_columns(3)
197
- col1, col2, col3 = st.beta_columns([2, 3, 3])
198
-
199
- with col1:
200
- label = st.selectbox(
201
- "Select label", data[label_column].unique().tolist()
202
- )
203
- # # with col2:
204
- # thres = st.slider(
205
- # "Select threshold",
206
- # min_value=0,
207
- # max_value=100,
208
- # step=1,
209
- # format="%f",
210
- # value=30,
211
- # )
212
- show_plots = st.checkbox("Show plots of top 100")
213
-
214
- with col2:
215
- st.subheader(f"Words __positively__ identifying label `{label}`")
216
- st.write(
217
- session.posdf[session.posdf[label_column] == label].sort_values(
218
- "score", ascending=False
219
- )
220
- )
221
- download_button(session.posdf, "positive_data")
222
- if show_plots:
223
- st.altair_chart(
224
- plot_score(session.posdf, label_column, label),
225
- use_container_width=True,
226
- )
227
-
228
- with col3:
229
- st.subheader(f"Words __negatively__ identifying label `{label}`")
230
- st.write(
231
- session.negdf[session.negdf[label_column] == label].sort_values(
232
- "score", ascending=False
233
- )
234
- )
235
- download_button(session.negdf, "negative_data")
236
- if show_plots:
237
- st.altair_chart(
238
- plot_score(session.negdf, label_column, label),
239
- use_container_width=True,
240
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/plotting.py DELETED
@@ -1,84 +0,0 @@
1
- import altair as alt
2
- import pandas as pd
3
- import streamlit as st
4
- from stqdm import stqdm
5
-
6
- stqdm.pandas()
7
-
8
-
9
- def plot_labels_prop(data: pd.DataFrame, label_column: str):
10
-
11
- unique_value_limit = 100
12
-
13
- if data[label_column].nunique() > unique_value_limit:
14
-
15
- st.warning(
16
- f"""
17
- The column you selected has more than {unique_value_limit}.
18
- Are you sure it's the right column? If it is, please note that
19
- this will impact __Wordify__ performance.
20
- """
21
- )
22
-
23
- return
24
-
25
- source = (
26
- data[label_column]
27
- .value_counts()
28
- .reset_index()
29
- .rename(columns={"index": "Labels", label_column: "Counts"})
30
- )
31
- source["Props"] = source["Counts"] / source["Counts"].sum()
32
- source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
33
-
34
- bars = (
35
- alt.Chart(source)
36
- .mark_bar()
37
- .encode(
38
- x=alt.X("Labels:O", sort="-y"),
39
- y="Counts:Q",
40
- )
41
- )
42
-
43
- text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
44
- text="Proportions:O"
45
- )
46
-
47
- return (bars + text).properties(height=300)
48
-
49
-
50
- def plot_nchars(data: pd.DataFrame, text_column: str):
51
- source = data[text_column].str.len().to_frame()
52
-
53
- plot = (
54
- alt.Chart(source)
55
- .mark_bar()
56
- .encode(
57
- alt.X(
58
- f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
59
- ),
60
- alt.Y("count()", axis=alt.Axis(title="")),
61
- )
62
- )
63
-
64
- return plot.properties(height=300)
65
-
66
-
67
- def plot_score(data: pd.DataFrame, label_col: str, label: str):
68
-
69
- source = (
70
- data.loc[data[label_col] == label]
71
- .sort_values("score", ascending=False)
72
- .head(100)
73
- )
74
-
75
- plot = (
76
- alt.Chart(source)
77
- .mark_bar()
78
- .encode(
79
- y=alt.Y("word:O", sort="-x"),
80
- x="score:Q",
81
- )
82
- )
83
-
84
- return plot.properties(height=max(30 * source.shape[0], 50))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/preprocessing.py CHANGED
@@ -6,10 +6,10 @@ from collections import OrderedDict
6
  from typing import Callable, List, Optional
7
 
8
  import pandas as pd
9
- from pandas.core.frame import DataFrame
10
  import spacy
11
  import streamlit as st
12
  import vaex
 
13
  from pandas.core.series import Series
14
  from textacy.preprocessing import make_pipeline, normalize, remove, replace
15
 
@@ -103,7 +103,9 @@ class PreprocessingPipeline:
103
  return self.post(self.lemma(self.nlp(self.pre(t))))
104
 
105
  vdf = vaex.from_pandas(df)
106
- vdf["processed_text"] = vdf.apply(fn, arguments=[vdf[text_column]], vectorize=False)
 
 
107
 
108
  return vdf.to_pandas_df()
109
 
@@ -115,7 +117,9 @@ class PreprocessingPipeline:
115
  total_steps = len(series) // 100
116
  res = []
117
  pbar = st.progress(0)
118
- for i, doc in enumerate(self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())):
 
 
119
  res.append(self.lemma(doc))
120
 
121
  if i % total_steps == 0:
 
6
  from typing import Callable, List, Optional
7
 
8
  import pandas as pd
 
9
  import spacy
10
  import streamlit as st
11
  import vaex
12
+ from pandas.core.frame import DataFrame
13
  from pandas.core.series import Series
14
  from textacy.preprocessing import make_pipeline, normalize, remove, replace
15
 
 
103
  return self.post(self.lemma(self.nlp(self.pre(t))))
104
 
105
  vdf = vaex.from_pandas(df)
106
+ vdf["processed_text"] = vdf.apply(
107
+ fn, arguments=[vdf[text_column]], vectorize=False
108
+ )
109
 
110
  return vdf.to_pandas_df()
111
 
 
117
  total_steps = len(series) // 100
118
  res = []
119
  pbar = st.progress(0)
120
+ for i, doc in enumerate(
121
+ self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
122
+ ):
123
  res.append(self.lemma(doc))
124
 
125
  if i % total_steps == 0:
src/session_state.py DELETED
@@ -1,121 +0,0 @@
1
- """Hack to add per-session state to Streamlit.
2
-
3
- Usage
4
- -----
5
-
6
- >>> import SessionState
7
- >>>
8
- >>> session_state = SessionState.get(user_name='', favorite_color='black')
9
- >>> session_state.user_name
10
- ''
11
- >>> session_state.user_name = 'Mary'
12
- >>> session_state.favorite_color
13
- 'black'
14
-
15
- Since you set user_name above, next time your script runs this will be the
16
- result:
17
- >>> session_state = get(user_name='', favorite_color='black')
18
- >>> session_state.user_name
19
- 'Mary'
20
-
21
- """
22
- try:
23
- import streamlit.ReportThread as ReportThread
24
- from streamlit.server.Server import Server
25
- except Exception:
26
- # Streamlit >= 0.65.0
27
- import streamlit.report_thread as ReportThread
28
- from streamlit.server.server import Server
29
-
30
-
31
- class SessionState(object):
32
- def __init__(self, **kwargs):
33
- """A new SessionState object.
34
-
35
- Parameters
36
- ----------
37
- **kwargs : any
38
- Default values for the session state.
39
-
40
- Example
41
- -------
42
- >>> session_state = SessionState(user_name='', favorite_color='black')
43
- >>> session_state.user_name = 'Mary'
44
- ''
45
- >>> session_state.favorite_color
46
- 'black'
47
-
48
- """
49
- for key, val in kwargs.items():
50
- setattr(self, key, val)
51
-
52
-
53
- def get(**kwargs):
54
- """Gets a SessionState object for the current session.
55
-
56
- Creates a new object if necessary.
57
-
58
- Parameters
59
- ----------
60
- **kwargs : any
61
- Default values you want to add to the session state, if we're creating a
62
- new one.
63
-
64
- Example
65
- -------
66
- >>> session_state = get(user_name='', favorite_color='black')
67
- >>> session_state.user_name
68
- ''
69
- >>> session_state.user_name = 'Mary'
70
- >>> session_state.favorite_color
71
- 'black'
72
-
73
- Since you set user_name above, next time your script runs this will be the
74
- result:
75
- >>> session_state = get(user_name='', favorite_color='black')
76
- >>> session_state.user_name
77
- 'Mary'
78
-
79
- """
80
- # Hack to get the session object from Streamlit.
81
-
82
- ctx = ReportThread.get_report_ctx()
83
-
84
- this_session = None
85
-
86
- current_server = Server.get_current()
87
- if hasattr(current_server, "_session_infos"):
88
- # Streamlit < 0.56
89
- session_infos = Server.get_current()._session_infos.values()
90
- else:
91
- session_infos = Server.get_current()._session_info_by_id.values()
92
-
93
- for session_info in session_infos:
94
- s = session_info.session
95
- if (
96
- # Streamlit < 0.54.0
97
- (hasattr(s, "_main_dg") and s._main_dg == ctx.main_dg)
98
- or
99
- # Streamlit >= 0.54.0
100
- (not hasattr(s, "_main_dg") and s.enqueue == ctx.enqueue)
101
- or
102
- # Streamlit >= 0.65.2
103
- (
104
- not hasattr(s, "_main_dg")
105
- and s._uploaded_file_mgr == ctx.uploaded_file_mgr
106
- )
107
- ):
108
- this_session = s
109
-
110
- if this_session is None:
111
- raise RuntimeError(
112
- "Oh noes. Couldn't get your Streamlit Session object. "
113
- "Are you doing something fancy with threads?"
114
- )
115
-
116
- # Got the session object! Now let's attach some state into it.
117
-
118
- if not hasattr(this_session, "_custom_session_state"):
119
- this_session._custom_session_state = SessionState(**kwargs)
120
-
121
- return this_session._custom_session_state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils.py CHANGED
@@ -1,4 +1,5 @@
1
  import base64
 
2
  import altair as alt
3
  import pandas as pd
4
  import streamlit as st
@@ -7,7 +8,6 @@ from PIL import Image
7
  from .configs import SupportedFiles
8
 
9
 
10
-
11
  @st.cache
12
  def get_logo(path):
13
  return Image.open(path)
@@ -52,7 +52,12 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
52
 
53
  return
54
 
55
- source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
 
 
 
 
 
56
  source["Props"] = source["Counts"] / source["Counts"].sum()
57
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
58
 
@@ -65,7 +70,9 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
65
  )
66
  )
67
 
68
- text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
 
 
69
 
70
  return (bars + text).properties(height=300)
71
 
@@ -77,7 +84,9 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
77
  alt.Chart(source)
78
  .mark_bar()
79
  .encode(
80
- alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
 
 
81
  alt.Y("count()", axis=alt.Axis(title="")),
82
  )
83
  )
@@ -87,7 +96,11 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
87
 
88
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
89
 
90
- source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
 
 
 
 
91
 
92
  plot = (
93
  alt.Chart(source)
 
1
  import base64
2
+
3
  import altair as alt
4
  import pandas as pd
5
  import streamlit as st
 
8
  from .configs import SupportedFiles
9
 
10
 
 
11
  @st.cache
12
  def get_logo(path):
13
  return Image.open(path)
 
52
 
53
  return
54
 
55
+ source = (
56
+ data[label_column]
57
+ .value_counts()
58
+ .reset_index()
59
+ .rename(columns={"index": "Labels", label_column: "Counts"})
60
+ )
61
  source["Props"] = source["Counts"] / source["Counts"].sum()
62
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
63
 
 
70
  )
71
  )
72
 
73
+ text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
74
+ text="Proportions:O"
75
+ )
76
 
77
  return (bars + text).properties(height=300)
78
 
 
84
  alt.Chart(source)
85
  .mark_bar()
86
  .encode(
87
+ alt.X(
88
+ f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
89
+ ),
90
  alt.Y("count()", axis=alt.Axis(title="")),
91
  )
92
  )
 
96
 
97
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
98
 
99
+ source = (
100
+ data.loc[data[label_col] == label]
101
+ .sort_values("score", ascending=False)
102
+ .head(100)
103
+ )
104
 
105
  plot = (
106
  alt.Chart(source)
src/wordifier.py CHANGED
@@ -12,7 +12,9 @@ from sklearn.utils import resample
12
  from .configs import InputTransformConfigs, ModelConfigs
13
 
14
 
15
- def input_transform(text: pd.Series, labels: pd.Series, configs=InputTransformConfigs) -> Dict[str, np.ndarray]:
 
 
16
  """
17
  Encodes text in mathematical object amenable to training algorithm
18
  """
@@ -45,7 +47,11 @@ def input_transform(text: pd.Series, labels: pd.Series, configs=InputTransformCo
45
 
46
 
47
  def wordifier(
48
- X: np.ndarray, y: np.ndarray, X_names: List[str], y_names: List[str], configs=ModelConfigs
 
 
 
 
49
  ) -> List[Tuple[str, float, str]]:
50
 
51
  n_instances, n_features = X.shape
@@ -85,7 +91,9 @@ def wordifier(
85
  )
86
 
87
  # sample indices to subsample matrix
88
- selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
 
 
89
 
90
  # fit
91
  try:
@@ -110,20 +118,36 @@ def wordifier(
110
  neg_scores = neg_scores / configs.NUM_ITERS.value
111
 
112
  # get only active features
113
- pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
114
- neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
 
 
 
 
115
 
116
  # prepare DataFrame
117
- pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
118
- neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
 
 
 
 
 
 
119
 
120
  return pos, neg
121
 
122
 
123
- def output_transform(pos: List[Tuple[str, float, str]], neg: List[Tuple[str, float, str]]) -> DataFrame:
124
- posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
 
 
 
 
125
  posdf["correlation"] = "positive"
126
- negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
 
 
127
  negdf["correlation"] = "negative"
128
 
129
  output = pd.concat([posdf, negdf], ignore_index=False, axis=0)
 
12
  from .configs import InputTransformConfigs, ModelConfigs
13
 
14
 
15
+ def input_transform(
16
+ text: pd.Series, labels: pd.Series, configs=InputTransformConfigs
17
+ ) -> Dict[str, np.ndarray]:
18
  """
19
  Encodes text in mathematical object amenable to training algorithm
20
  """
 
47
 
48
 
49
  def wordifier(
50
+ X: np.ndarray,
51
+ y: np.ndarray,
52
+ X_names: List[str],
53
+ y_names: List[str],
54
+ configs=ModelConfigs,
55
  ) -> List[Tuple[str, float, str]]:
56
 
57
  n_instances, n_features = X.shape
 
91
  )
92
 
93
  # sample indices to subsample matrix
94
+ selection = resample(
95
+ np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
96
+ )
97
 
98
  # fit
99
  try:
 
118
  neg_scores = neg_scores / configs.NUM_ITERS.value
119
 
120
  # get only active features
121
+ pos_positions = np.where(
122
+ pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
123
+ )
124
+ neg_positions = np.where(
125
+ neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
126
+ )
127
 
128
  # prepare DataFrame
129
+ pos = [
130
+ (X_names[i], pos_scores[c, i], y_names[c])
131
+ for c, i in zip(*pos_positions.nonzero())
132
+ ]
133
+ neg = [
134
+ (X_names[i], neg_scores[c, i], y_names[c])
135
+ for c, i in zip(*neg_positions.nonzero())
136
+ ]
137
 
138
  return pos, neg
139
 
140
 
141
+ def output_transform(
142
+ pos: List[Tuple[str, float, str]], neg: List[Tuple[str, float, str]]
143
+ ) -> DataFrame:
144
+ posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(
145
+ ["label", "score"], ascending=False
146
+ )
147
  posdf["correlation"] = "positive"
148
+ negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
149
+ ["label", "score"], ascending=False
150
+ )
151
  negdf["correlation"] = "negative"
152
 
153
  output = pd.concat([posdf, negdf], ignore_index=False, axis=0)