Commit 7055ca6 by Paula Leonova (1 parent: d855e09)

Add a table for keywords for all uploaded text

Files changed (1): app.py (+72, -52)
app.py CHANGED
```diff
@@ -42,11 +42,12 @@ with st.form(key='my_form'):
 
     text_csv_expander = st.expander(label=f'Want to upload multiple texts at once? Expand to upload your text files below.', expanded=False)
     with text_csv_expander:
-        st.write("Option A:")
+        st.markdown('##### Choose one of the options below:')
+        st.write("__Option A:__")
         uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
                                                accept_multiple_files=True, key = 'text_uploader',
                                                type = 'txt')
-        st.write("Option B:")
+        st.write("__Option B:__")
         uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with columns: "title" and "text"',
                                                    accept_multiple_files=False, key = 'csv_text_uploader',
                                                    type = 'csv')
@@ -57,12 +58,12 @@ with st.form(key='my_form'):
 
     st.text("\n\n\n")
     st.markdown("##### Step 2: Enter Labels")
-    labels = st.text_input('Enter possible topic labels, which can be either keywords and/or general themes (comma-separated):',input_labels, max_chars=1000)
+    labels = st.text_input('Enter possible topic labels, which can be either keywords and/or general themes (comma-separated):',input_labels, max_chars=2000)
     labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
 
     labels_csv_expander = st.expander(label=f'Prefer to upload a list of labels instead? Click here to upload your CSV file.',expanded=False)
     with labels_csv_expander:
-        uploaded_labels_file = st.file_uploader("Or Choose a CSV file with one column and no header, where each cell is a separate label",
+        uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
                                                 key='labels_uploader')
 
     gen_keywords = st.radio(
@@ -72,16 +73,17 @@ with st.form(key='my_form'):
 
     st.text("\n\n\n")
     st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
-    glabels = st.text_input('If available, enter ground truth topic labels to evaluate results, otherwise leave blank (comma-separated):',input_glabels, max_chars=1000)
+    glabels = st.text_input('If available, enter ground truth topic labels to evaluate results, otherwise leave blank (comma-separated):',input_glabels, max_chars=2000)
     glabels = list(set([x.strip() for x in glabels.strip().split(',') if len(x.strip()) > 0]))
 
 
     glabels_csv_expander = st.expander(label=f'Have a file with labels for the text? Click here to upload your CSV file.', expanded=False)
     with glabels_csv_expander:
-        st.write("Option A:")
+        st.markdown('##### Choose one of the options below:')
+        st.write("__Option A:__")
         uploaded_onetext_glabels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
                                                          key = 'onetext_glabels_uploader')
-        st.write("Option B:")
+        st.write("__Option B:__")
         uploaded_multitext_glabels_file = st.file_uploader('Or Choose a CSV file with two columns "title" and "label", with the cells in the title column matching the name of the files uploaded in step #1.',
                                                            key = 'multitext_glabels_uploader')
 
@@ -116,8 +118,10 @@ if submit_button or example_button:
         st.error("Enter some text to generate a summary")
     else:
 
+        # OPTION A:
         if uploaded_text_files is not None:
             st.markdown("### Text Inputs")
+            st.write('Files concatenated into a dataframe:')
             file_names = []
             raw_texts = []
             for uploaded_file in uploaded_text_files:
@@ -125,63 +129,79 @@ if submit_button or example_button:
                 raw_texts.append(text)
                 title_file_name = uploaded_file.name.replace('.txt','')
                 file_names.append(title_file_name)
-            text_data = pd.DataFrame({'title': file_names,
+            text_df = pd.DataFrame({'title': file_names,
                                       'text': raw_texts})
-            st.dataframe(text_data.head())
+            st.dataframe(text_df.head())
             st.download_button(
                 label="Download data as CSV",
-                data=text_data.to_csv().encode('utf-8'),
-                file_name='title_text_data.csv',
+                data=text_df.to_csv().encode('utf-8'),
+                file_name='title_text.csv',
                 mime='title_text/csv',
             )
+        # OPTION B: [TO DO: DIRECT CSV UPLOAD INSTEAD]
 
 
         with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
             # For each body of text, create text chunks of a certain token size required for the transformer
-            nested_sentences = md.create_nest_sentences(document = text_input, token_max_length = 1024)
-            # For each chunk of sentences (within the token max)
-            text_chunks = []
-            for n in range(0, len(nested_sentences)):
-                tc = " ".join(map(str, nested_sentences[n]))
-                text_chunks.append(tc)
-
-        if gen_keywords == 'Yes':
-            st.markdown("### Top Keywords")
-            with st.spinner("Generating keywords from text..."):
-
-                kw_df = pd.DataFrame()
-                for text_chunk in text_chunks:
+
+            text_chunks_lib = dict()
+            for i in range(0, len(text_df)):
+                nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
+
+                # For each chunk of sentences (within the token max)
+                text_chunks = []
+                for n in range(0, len(nested_sentences)):
+                    tc = " ".join(map(str, nested_sentences[n]))
+                    text_chunks.append(tc)
+                title_entry = text_df['title'][i]
+                text_chunks_lib[title_entry] = text_chunks
+
+        if gen_keywords == 'Yes':
+            st.markdown("### Top Keywords")
+            with st.spinner("Generating keywords from text..."):
+
+                kw_dict = dict()
+                for key in text_chunks_lib:
+                    for text_chunk in text_chunks_lib[key]:
                         keywords_list = md.keyword_gen(kw_model, text_chunk)
-                        kw_df = kw_df.append(pd.DataFrame(keywords_list))
-                kw_df.columns = ['keyword', 'score']
-                top_kw_df = kw_df.groupby('keyword')['score'].max().reset_index()
+                        kw_dict[key] = dict(keywords_list)
+
+                kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
+                kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
+                kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()
+                kw_df = kw_df[kw_df['score'] > 0.1][['title', 'keyword', 'score']].reset_index().drop(columns='index').sort_values(['title', 'score'], ascending=False)
+                st.dataframe(kw_df)
+                st.download_button(
+                    label="Download data as CSV",
+                    data=kw_df.to_csv().encode('utf-8'),
+                    file_name='title_kewyords.csv',
+                    mime='title_kewyords/csv',
+                )
 
-                top_kw_df = top_kw_df.sort_values('score', ascending = False).reset_index().drop(['index'], axis=1)
-                st.dataframe(top_kw_df.head(10))
 
-        st.markdown("### Summary")
-        with st.spinner(f'Generating summaries for {len(text_chunks)} text chunks (this may take a minute)...'):
-
-            my_expander = st.expander(label=f'Expand to see intermediate summary generation details for {len(text_chunks)} text chunks')
-            with my_expander:
-                summary = []
-
-                st.markdown("_Once the original text is broken into smaller chunks (totaling no more than 1024 tokens, \
-                    with complete sentences), each block of text is then summarized separately using BART NLI \
-                    and then combined at the very end to generate the final summary._")
-
-                for num_chunk, text_chunk in enumerate(text_chunks):
-                    st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
-                    st.markdown(text_chunk)
-
-                    chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
-                    summary.append(chunk_summary)
-                    st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
-                    st.markdown(chunk_summary)
-                # Combine all the summaries into a list and compress into one document, again
-                final_summary = " \n\n".join(list(summary))
-
-                st.markdown(final_summary)
+        st.markdown("### Summary")
+        with st.spinner(f'Generating summaries for {len(text_chunks)} text chunks (this may take a minute)...'):
+
+            my_expander = st.expander(label=f'Expand to see intermediate summary generation details for {len(text_chunks)} text chunks')
+            with my_expander:
+                summary = []
+
+                st.markdown("_Once the original text is broken into smaller chunks (totaling no more than 1024 tokens, \
+                    with complete sentences), each block of text is then summarized separately using BART NLI \
+                    and then combined at the very end to generate the final summary._")
+
+                for num_chunk, text_chunk in enumerate(text_chunks):
+                    st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
+                    st.markdown(text_chunk)
+
+                    chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
+                    summary.append(chunk_summary)
+                    st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
+                    st.markdown(chunk_summary)
+                # Combine all the summaries into a list and compress into one document, again
+                final_summary = " \n\n".join(list(summary))
+
+                st.markdown(final_summary)
 
         if len(text_input) == 0 or len(labels) == 0:
             st.error('Enter some text and at least one possible topic to see label predictions.')
```
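The chunking step above routes every uploaded text through `md.create_nest_sentences`, which packs whole sentences into groups that stay under the transformer's 1024-token ceiling. That helper is defined outside this diff, so the following is only a rough, hypothetical stand-in for the packing idea, using whitespace-separated words as a crude token count:

```python
import re

def create_nest_sentences_sketch(document: str, token_max_length: int = 1024) -> list:
    """Greedily group whole sentences so each group stays under a token budget.

    Hypothetical stand-in for md.create_nest_sentences (not part of this diff);
    a real implementation would count tokens with the model's tokenizer
    rather than splitting on whitespace.
    """
    sentences = re.split(r'(?<=[.!?])\s+', document.strip())
    nested, current, current_length = [], [], 0
    for sentence in sentences:
        n_tokens = len(sentence.split())  # crude proxy for the true token count
        if current and current_length + n_tokens > token_max_length:
            nested.append(current)  # close the chunk before it overflows
            current, current_length = [], 0
        current.append(sentence)
        current_length += n_tokens
    if current:
        nested.append(current)
    return nested
```

Packing at sentence boundaries is what lets the app promise "complete sentences" in each chunk rather than cutting mid-sentence at exactly 1024 tokens.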
 
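The headline change of this commit is the keyword table: keyword scores are gathered per title into a nested dict, turned into a wide dataframe (keywords as rows, titles as columns), then melted into one long table that is filtered and sorted. A self-contained sketch of that reshaping, with made-up titles and scores standing in for `md.keyword_gen` output:

```python
import pandas as pd

# Toy scores standing in for md.keyword_gen output, keyed by document title
kw_dict = {
    'doc_a': {'transformers': 0.71, 'summary': 0.42, 'filler': 0.05},
    'doc_b': {'keywords': 0.66, 'summary': 0.31},
}

# Wide frame: one row per keyword, one column per title
kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
kw_df0.rename(columns={'index': 'keyword'}, inplace=True)

# Long frame: (title, keyword, score) rows; dropna removes keywords a title never produced
kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()
kw_df = (kw_df[kw_df['score'] > 0.1][['title', 'keyword', 'score']]
         .sort_values(['title', 'score'], ascending=False)
         .reset_index(drop=True))
print(kw_df)  # 'filler' (0.05) is dropped by the 0.1 score threshold
```

One quirk worth noting in the committed loop: `kw_dict[key] = dict(keywords_list)` reassigns the whole entry on every chunk, so a multi-chunk document keeps only its last chunk's keywords; merging into the per-title dict instead would retain keywords from every chunk.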
 
 
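The summary block moved in this commit but its logic is unchanged: each chunk is summarized on its own, and the partial summaries are joined into the final document. A compact sketch of that chunk-then-combine pattern, assuming a BART summarization checkpoint such as `facebook/bart-large-cnn` (the app's actual `md.summarizer_gen` wrapper is not shown in this diff):

```python
from transformers import pipeline

# Hypothetical stand-in for md.summarizer_gen and the surrounding loop
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_chunks(text_chunks, maximum_tokens=300, minimum_tokens=20):
    partial_summaries = []
    for text_chunk in text_chunks:
        result = summarizer(text_chunk, max_length=maximum_tokens, min_length=minimum_tokens)
        partial_summaries.append(result[0]['summary_text'])
    # Combine all partial summaries into one final document
    return " \n\n".join(partial_summaries)
```

Because each chunk already fits the model's input limit, the per-chunk pass is what makes arbitrarily long uploads summarizable at all; the join at the end is a simple concatenation rather than a second summarization pass.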