Paula Leonova committed on
Commit
b1bf232
1 Parent(s): 009207e

Add option B of loading text csv

Browse files
Files changed (1) hide show
  1. app.py +77 -55
app.py CHANGED
@@ -46,15 +46,24 @@ with st.form(key='my_form'):
46
  st.write("__Option A:__")
47
  uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
48
  accept_multiple_files=True, key = 'text_uploader',
49
- type = 'txt')
50
  st.write("__Option B:__")
51
  uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with two columns: "title" and "text"',
52
  accept_multiple_files=False, key = 'csv_text_uploader',
53
- type = 'csv')
54
 
55
  if text_input == display_text and display_text != '':
56
  text_input = example_text
57
 
 
 
 
 
 
 
 
 
 
58
 
59
  st.text("\n\n\n")
60
  st.markdown("##### Step 2: Enter Labels")
@@ -66,10 +75,11 @@ with st.form(key='my_form'):
66
  uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
67
  key='labels_uploader')
68
 
69
- gen_keywords = st.radio(
70
- "Generate keywords from text (independent from the above labels)?",
71
- ('Yes', 'No')
72
- )
 
73
 
74
  st.text("\n\n\n")
75
  st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
@@ -119,9 +129,9 @@ if submit_button or example_button:
119
  else:
120
 
121
  if len(text_input) != 0:
122
- text_df = pd.DataFrame.from_dict({'title': ['sample'], 'text': [text_input]})
123
 
124
- # OPTION A:
125
  elif uploaded_text_files is not None:
126
  st.markdown("### Text Inputs")
127
  st.write('Files concatenated into a dataframe:')
@@ -141,11 +151,9 @@ if submit_button or example_button:
141
  file_name='title_text.csv',
142
  mime='title_text/csv',
143
  )
144
- # OPTION B: [TO DO: DIRECT CSV UPLOAD INSTEAD]
145
-
146
-
147
- if len(text_input) != 0:
148
- text_df = pd.DataFrame.from_dict({'title': ['Submitted Text'], 'text': [text_input]})
149
 
150
 
151
  with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
@@ -196,71 +204,85 @@ if submit_button or example_button:
196
 
197
 
198
  st.markdown("### Summary")
199
- with st.spinner(f'Generating summaries for {len(text_df)} texts consisting of a total of {text_chunk_counter} chunks (this may take a minute)...'):
200
- sum_dict = dict()
201
- for i, key in enumerate(text_chunks_lib):
202
- with st.expander(label=f'({i+1}/{len(text_df)}) Expand to see intermediate summary generation details for: {key}', expanded=False):
203
- # for key in text_chunks_lib:
204
- summary = []
205
- for num_chunk, text_chunk in enumerate(text_chunks_lib[key]):
206
- chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens=300, minimum_tokens=20)
207
- summary.append(chunk_summary)
208
-
209
- st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
210
- st.markdown(text_chunk)
211
- st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
212
- st.markdown(chunk_summary)
213
-
214
- # Combine all the summaries into a list and compress into one document, again
215
- final_summary = "\n\n".join(list(summary))
216
- sum_dict[key] = [final_summary]
217
-
218
- sum_df = pd.DataFrame.from_dict(sum_dict).T.reset_index()
219
- sum_df.columns = ['title', 'summary_text']
220
- # TO DO: Make sure summary_text does not exceed the token length
221
-
222
- st.dataframe(sum_df)
223
- st.download_button(
224
- label="Download data as CSV",
225
- data=sum_df.to_csv().encode('utf-8'),
226
- file_name='title_summary.csv',
227
- mime='title_summary/csv',
 
228
  )
229
 
230
  if ((len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None)
231
  or (len(labels) == 0 and uploaded_labels_file is None)):
232
  st.error('Enter some text and at least one possible topic to see label predictions.')
233
  else:
234
- st.markdown("### Top Label Predictions on Summary vs Full Text")
 
 
 
235
 
236
  if uploaded_labels_file is not None:
237
- labels_df = pd.read_csv(uploaded_labels_file)
238
  label_list = labels_df.iloc[:, 0]
239
  else:
240
  label_list = labels
241
- st.write(label_list)
242
 
243
- with st.spinner('Matching labels...'):
244
-
245
- labels_sum_col_list = ['title', 'label', 'scores_from_summary']
246
- labels_sum_df = pd.DataFrame(columns=labels_sum_col_list)
247
 
248
  labels_full_col_list = ['title', 'label', 'scores_from_full_text']
249
  labels_full_df = pd.DataFrame(columns=labels_full_col_list)
250
 
251
  for i in range(0, len(text_df)):
252
-
253
- s_topics, s_scores = md.classifier_zero(classifier, sequence=sum_df['summary_text'][i], labels=label_list, multi_class=True)
254
- ls_df = pd.DataFrame({'label': s_topics, 'scores_from_summary': s_scores})
255
- ls_df['title'] = text_df['title'][i]
256
- labels_sum_df = pd.concat([labels_sum_df, ls_df[labels_sum_col_list]])
257
 
258
  f_topics, f_scores = md.classifier_zero(classifier, sequence=text_df['text'][i], labels=label_list, multi_class=True)
259
  lf_df = pd.DataFrame({'label': f_topics, 'scores_from_full_text': f_scores})
260
  lf_df['title'] = text_df['title'][i]
261
  labels_full_df = pd.concat([labels_full_df, lf_df[labels_full_col_list]])
262
 
263
- label_match_df = pd.merge(labels_sum_df, labels_full_df, on=['title','label'])
 
 
 
 
 
 
 
 
 
 
 
264
  st.dataframe(label_match_df)
265
  st.download_button(
266
  label="Download data as CSV",
 
46
  st.write("__Option A:__")
47
  uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
48
  accept_multiple_files=True, key = 'text_uploader',
49
+ type='txt')
50
  st.write("__Option B:__")
51
  uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with two columns: "title" and "text"',
52
  accept_multiple_files=False, key = 'csv_text_uploader',
53
+ type='csv')
54
 
55
  if text_input == display_text and display_text != '':
56
  text_input = example_text
57
 
58
+ gen_keywords = st.radio(
59
+ "Generate keywords from text? (independent from the input labels below)",
60
+ ('Yes', 'No')
61
+ )
62
+
63
+ gen_summary = st.radio(
64
+ "Generate summary from text? (recommended for label matching below, but will take longer)",
65
+ ('Yes', 'No')
66
+ )
67
 
68
  st.text("\n\n\n")
69
  st.markdown("##### Step 2: Enter Labels")
 
75
  uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
76
  key='labels_uploader')
77
 
78
+ # summary_option = st.multiselect(
79
+ # "Match labels to text using?",
80
+ # ['Summary', 'Full Text'],
81
+ # ['Summary', 'Full Text']
82
+ # )
83
 
84
  st.text("\n\n\n")
85
  st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
 
129
  else:
130
 
131
  if len(text_input) != 0:
132
+ text_df = pd.DataFrame.from_dict({'title': ['Submitted Text'], 'text': [text_input]})
133
 
134
+ # OPTION A
135
  elif uploaded_text_files is not None:
136
  st.markdown("### Text Inputs")
137
  st.write('Files concatenated into a dataframe:')
 
151
  file_name='title_text.csv',
152
  mime='title_text/csv',
153
  )
154
+ # OPTION B
155
+ elif uploaded_csv_text_files is not None:
156
+ text_df = pd.read_csv(uploaded_csv_text_files)
 
 
157
 
158
 
159
  with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
 
204
 
205
 
206
  st.markdown("### Summary")
207
+ if gen_summary == 'Yes':
208
+ with st.spinner(f'Generating summaries for {len(text_df)} texts consisting of a total of {text_chunk_counter} chunks (this may take a minute)...'):
209
+ sum_dict = dict()
210
+ for i, key in enumerate(text_chunks_lib):
211
+ with st.expander(label=f'({i+1}/{len(text_df)}) Expand to see intermediate summary generation details for: {key}', expanded=False):
212
+ # for key in text_chunks_lib:
213
+ summary = []
214
+ for num_chunk, text_chunk in enumerate(text_chunks_lib[key]):
215
+ chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens=300, minimum_tokens=20)
216
+ summary.append(chunk_summary)
217
+
218
+ st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
219
+ st.markdown(text_chunk)
220
+ st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
221
+ st.markdown(chunk_summary)
222
+
223
+ # Combine all the summaries into a list and compress into one document, again
224
+ final_summary = "\n\n".join(list(summary))
225
+ sum_dict[key] = [final_summary]
226
+
227
+ sum_df = pd.DataFrame.from_dict(sum_dict).T.reset_index()
228
+ sum_df.columns = ['title', 'summary_text']
229
+ # TO DO: Make sure summary_text does not exceed the token length
230
+
231
+ st.dataframe(sum_df)
232
+ st.download_button(
233
+ label="Download data as CSV",
234
+ data=sum_df.to_csv().encode('utf-8'),
235
+ file_name='title_summary.csv',
236
+ mime='title_summary/csv',
237
  )
238
 
239
  if ((len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None)
240
  or (len(labels) == 0 and uploaded_labels_file is None)):
241
  st.error('Enter some text and at least one possible topic to see label predictions.')
242
  else:
243
+ if gen_summary == 'Yes':
244
+ st.markdown("### Top Label Predictions on Summary vs Full Text")
245
+ else:
246
+ st.markdown("### Top Label Predictions on Full Text")
247
 
248
  if uploaded_labels_file is not None:
249
+ labels_df = pd.read_csv(uploaded_labels_file, header=None)
250
  label_list = labels_df.iloc[:, 0]
251
  else:
252
  label_list = labels
 
253
 
254
+ with st.spinner('Matching labels...(may take some time)'):
255
+ if gen_summary == 'Yes':
256
+ labels_sum_col_list = ['title', 'label', 'scores_from_summary']
257
+ labels_sum_df = pd.DataFrame(columns=labels_sum_col_list)
258
 
259
  labels_full_col_list = ['title', 'label', 'scores_from_full_text']
260
  labels_full_df = pd.DataFrame(columns=labels_full_col_list)
261
 
262
  for i in range(0, len(text_df)):
263
+ if gen_summary == 'Yes':
264
+ s_topics, s_scores = md.classifier_zero(classifier, sequence=sum_df['summary_text'][i], labels=label_list, multi_class=True)
265
+ ls_df = pd.DataFrame({'label': s_topics, 'scores_from_summary': s_scores})
266
+ ls_df['title'] = text_df['title'][i]
267
+ labels_sum_df = pd.concat([labels_sum_df, ls_df[labels_sum_col_list]])
268
 
269
  f_topics, f_scores = md.classifier_zero(classifier, sequence=text_df['text'][i], labels=label_list, multi_class=True)
270
  lf_df = pd.DataFrame({'label': f_topics, 'scores_from_full_text': f_scores})
271
  lf_df['title'] = text_df['title'][i]
272
  labels_full_df = pd.concat([labels_full_df, lf_df[labels_full_col_list]])
273
 
274
+ with st.expander(f'({i+1}/{len(text_df)}) See intermediate label matching results'):
275
+ st.write(f"Results for {text_df['title'][i]}")
276
+ if gen_summary == 'Yes':
277
+ st.dataframe(pd.merge(labels_sum_df, labels_full_df, on=['title','label']))
278
+ else:
279
+ st.dataframe(labels_full_df)
280
+
281
+ if gen_summary == 'Yes':
282
+ label_match_df = pd.merge(labels_sum_df, labels_full_df, on=['title','label'])
283
+ else:
284
+ label_match_df = labels_full_df.copy()
285
+
286
  st.dataframe(label_match_df)
287
  st.download_button(
288
  label="Download data as CSV",