Paula Leonova
committed on
Commit
•
b1bf232
1
Parent(s):
009207e
Add option B of loading text csv
Browse files
app.py
CHANGED
@@ -46,15 +46,24 @@ with st.form(key='my_form'):
|
|
46 |
st.write("__Option A:__")
|
47 |
uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
|
48 |
accept_multiple_files=True, key = 'text_uploader',
|
49 |
-
type
|
50 |
st.write("__Option B:__")
|
51 |
uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with two columns: "title" and "text"',
|
52 |
accept_multiple_files=False, key = 'csv_text_uploader',
|
53 |
-
type
|
54 |
|
55 |
if text_input == display_text and display_text != '':
|
56 |
text_input = example_text
|
57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
st.text("\n\n\n")
|
60 |
st.markdown("##### Step 2: Enter Labels")
|
@@ -66,10 +75,11 @@ with st.form(key='my_form'):
|
|
66 |
uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
67 |
key='labels_uploader')
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
73 |
|
74 |
st.text("\n\n\n")
|
75 |
st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
|
@@ -119,9 +129,9 @@ if submit_button or example_button:
|
|
119 |
else:
|
120 |
|
121 |
if len(text_input) != 0:
|
122 |
-
text_df = pd.DataFrame.from_dict({'title': ['
|
123 |
|
124 |
-
# OPTION A
|
125 |
elif uploaded_text_files is not None:
|
126 |
st.markdown("### Text Inputs")
|
127 |
st.write('Files concatenated into a dataframe:')
|
@@ -141,11 +151,9 @@ if submit_button or example_button:
|
|
141 |
file_name='title_text.csv',
|
142 |
mime='title_text/csv',
|
143 |
)
|
144 |
-
# OPTION B
|
145 |
-
|
146 |
-
|
147 |
-
if len(text_input) != 0:
|
148 |
-
text_df = pd.DataFrame.from_dict({'title': ['Submitted Text'], 'text': [text_input]})
|
149 |
|
150 |
|
151 |
with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
|
@@ -196,71 +204,85 @@ if submit_button or example_button:
|
|
196 |
|
197 |
|
198 |
st.markdown("### Summary")
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
|
|
228 |
)
|
229 |
|
230 |
if ((len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None)
|
231 |
or (len(labels) == 0 and uploaded_labels_file is None)):
|
232 |
st.error('Enter some text and at least one possible topic to see label predictions.')
|
233 |
else:
|
234 |
-
|
|
|
|
|
|
|
235 |
|
236 |
if uploaded_labels_file is not None:
|
237 |
-
labels_df = pd.read_csv(uploaded_labels_file)
|
238 |
label_list = labels_df.iloc[:, 0]
|
239 |
else:
|
240 |
label_list = labels
|
241 |
-
st.write(label_list)
|
242 |
|
243 |
-
with st.spinner('Matching labels...'):
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
|
248 |
labels_full_col_list = ['title', 'label', 'scores_from_full_text']
|
249 |
labels_full_df = pd.DataFrame(columns=labels_full_col_list)
|
250 |
|
251 |
for i in range(0, len(text_df)):
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
|
258 |
f_topics, f_scores = md.classifier_zero(classifier, sequence=text_df['text'][i], labels=label_list, multi_class=True)
|
259 |
lf_df = pd.DataFrame({'label': f_topics, 'scores_from_full_text': f_scores})
|
260 |
lf_df['title'] = text_df['title'][i]
|
261 |
labels_full_df = pd.concat([labels_full_df, lf_df[labels_full_col_list]])
|
262 |
|
263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
st.dataframe(label_match_df)
|
265 |
st.download_button(
|
266 |
label="Download data as CSV",
|
|
|
46 |
st.write("__Option A:__")
|
47 |
uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
|
48 |
accept_multiple_files=True, key = 'text_uploader',
|
49 |
+
type='txt')
|
50 |
st.write("__Option B:__")
|
51 |
uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with two columns: "title" and "text"',
|
52 |
accept_multiple_files=False, key = 'csv_text_uploader',
|
53 |
+
type='csv')
|
54 |
|
55 |
if text_input == display_text and display_text != '':
|
56 |
text_input = example_text
|
57 |
|
58 |
+
gen_keywords = st.radio(
|
59 |
+
"Generate keywords from text? (independent from the input labels below)",
|
60 |
+
('Yes', 'No')
|
61 |
+
)
|
62 |
+
|
63 |
+
gen_summary = st.radio(
|
64 |
+
"Generate summary from text? (recommended for label matching below, but will take longer)",
|
65 |
+
('Yes', 'No')
|
66 |
+
)
|
67 |
|
68 |
st.text("\n\n\n")
|
69 |
st.markdown("##### Step 2: Enter Labels")
|
|
|
75 |
uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
76 |
key='labels_uploader')
|
77 |
|
78 |
+
# summary_option = st.multiselect(
|
79 |
+
# "Match labels to text using?",
|
80 |
+
# ['Summary', 'Full Text'],
|
81 |
+
# ['Summary', 'Full Text']
|
82 |
+
# )
|
83 |
|
84 |
st.text("\n\n\n")
|
85 |
st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
|
|
|
129 |
else:
|
130 |
|
131 |
if len(text_input) != 0:
|
132 |
+
text_df = pd.DataFrame.from_dict({'title': ['Submitted Text'], 'text': [text_input]})
|
133 |
|
134 |
+
# OPTION A
|
135 |
elif uploaded_text_files is not None:
|
136 |
st.markdown("### Text Inputs")
|
137 |
st.write('Files concatenated into a dataframe:')
|
|
|
151 |
file_name='title_text.csv',
|
152 |
mime='title_text/csv',
|
153 |
)
|
154 |
+
# OPTION B
|
155 |
+
elif uploaded_csv_text_files is not None:
|
156 |
+
text_df = pd.read_csv(uploaded_csv_text_files)
|
|
|
|
|
157 |
|
158 |
|
159 |
with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
|
|
|
204 |
|
205 |
|
206 |
st.markdown("### Summary")
|
207 |
+
if gen_summary == 'Yes':
|
208 |
+
with st.spinner(f'Generating summaries for {len(text_df)} texts consisting of a total of {text_chunk_counter} chunks (this may take a minute)...'):
|
209 |
+
sum_dict = dict()
|
210 |
+
for i, key in enumerate(text_chunks_lib):
|
211 |
+
with st.expander(label=f'({i+1}/{len(text_df)}) Expand to see intermediate summary generation details for: {key}', expanded=False):
|
212 |
+
# for key in text_chunks_lib:
|
213 |
+
summary = []
|
214 |
+
for num_chunk, text_chunk in enumerate(text_chunks_lib[key]):
|
215 |
+
chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens=300, minimum_tokens=20)
|
216 |
+
summary.append(chunk_summary)
|
217 |
+
|
218 |
+
st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
|
219 |
+
st.markdown(text_chunk)
|
220 |
+
st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
|
221 |
+
st.markdown(chunk_summary)
|
222 |
+
|
223 |
+
# Combine all the summaries into a list and compress into one document, again
|
224 |
+
final_summary = "\n\n".join(list(summary))
|
225 |
+
sum_dict[key] = [final_summary]
|
226 |
+
|
227 |
+
sum_df = pd.DataFrame.from_dict(sum_dict).T.reset_index()
|
228 |
+
sum_df.columns = ['title', 'summary_text']
|
229 |
+
# TO DO: Make sure summary_text does not exceed the token length
|
230 |
+
|
231 |
+
st.dataframe(sum_df)
|
232 |
+
st.download_button(
|
233 |
+
label="Download data as CSV",
|
234 |
+
data=sum_df.to_csv().encode('utf-8'),
|
235 |
+
file_name='title_summary.csv',
|
236 |
+
mime='title_summary/csv',
|
237 |
)
|
238 |
|
239 |
if ((len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None)
|
240 |
or (len(labels) == 0 and uploaded_labels_file is None)):
|
241 |
st.error('Enter some text and at least one possible topic to see label predictions.')
|
242 |
else:
|
243 |
+
if gen_summary == 'Yes':
|
244 |
+
st.markdown("### Top Label Predictions on Summary vs Full Text")
|
245 |
+
else:
|
246 |
+
st.markdown("### Top Label Predictions on Full Text")
|
247 |
|
248 |
if uploaded_labels_file is not None:
|
249 |
+
labels_df = pd.read_csv(uploaded_labels_file, header=None)
|
250 |
label_list = labels_df.iloc[:, 0]
|
251 |
else:
|
252 |
label_list = labels
|
|
|
253 |
|
254 |
+
with st.spinner('Matching labels...(may take some time)'):
|
255 |
+
if gen_summary == 'Yes':
|
256 |
+
labels_sum_col_list = ['title', 'label', 'scores_from_summary']
|
257 |
+
labels_sum_df = pd.DataFrame(columns=labels_sum_col_list)
|
258 |
|
259 |
labels_full_col_list = ['title', 'label', 'scores_from_full_text']
|
260 |
labels_full_df = pd.DataFrame(columns=labels_full_col_list)
|
261 |
|
262 |
for i in range(0, len(text_df)):
|
263 |
+
if gen_summary == 'Yes':
|
264 |
+
s_topics, s_scores = md.classifier_zero(classifier, sequence=sum_df['summary_text'][i], labels=label_list, multi_class=True)
|
265 |
+
ls_df = pd.DataFrame({'label': s_topics, 'scores_from_summary': s_scores})
|
266 |
+
ls_df['title'] = text_df['title'][i]
|
267 |
+
labels_sum_df = pd.concat([labels_sum_df, ls_df[labels_sum_col_list]])
|
268 |
|
269 |
f_topics, f_scores = md.classifier_zero(classifier, sequence=text_df['text'][i], labels=label_list, multi_class=True)
|
270 |
lf_df = pd.DataFrame({'label': f_topics, 'scores_from_full_text': f_scores})
|
271 |
lf_df['title'] = text_df['title'][i]
|
272 |
labels_full_df = pd.concat([labels_full_df, lf_df[labels_full_col_list]])
|
273 |
|
274 |
+
with st.expander(f'({i+1}/{len(text_df)}) See intermediate label matching results'):
|
275 |
+
st.write(f"Results for {text_df['title'][i]}")
|
276 |
+
if gen_summary == 'Yes':
|
277 |
+
st.dataframe(pd.merge(labels_sum_df, labels_full_df, on=['title','label']))
|
278 |
+
else:
|
279 |
+
st.dataframe(labels_full_df)
|
280 |
+
|
281 |
+
if gen_summary == 'Yes':
|
282 |
+
label_match_df = pd.merge(labels_sum_df, labels_full_df, on=['title','label'])
|
283 |
+
else:
|
284 |
+
label_match_df = labels_full_df.copy()
|
285 |
+
|
286 |
st.dataframe(label_match_df)
|
287 |
st.download_button(
|
288 |
label="Download data as CSV",
|