Paula Leonova committed
Commit · 39c7695
Parent(s): 7055ca6

Add back option for single text entry

app.py CHANGED
```diff
@@ -19,8 +19,8 @@ ex_long_text = example_long_text_load()
 
 # if __name__ == '__main__':
 st.markdown("### Long Text Summarization & Multi-Label Classification")
-st.write("This app summarizes and then classifies your long text with multiple labels using [BART Large MNLI](https://huggingface.co/facebook/bart-large-mnli). The keywords are generated using [KeyBERT](https://github.com/MaartenGr/KeyBERT).")
-st.write("__Inputs__: User enters their own custom text and labels.")
+st.write("This app summarizes and then classifies your long text(s) with multiple labels using [BART Large MNLI](https://huggingface.co/facebook/bart-large-mnli). The keywords are generated using [KeyBERT](https://github.com/MaartenGr/KeyBERT).")
+st.write("__Inputs__: User enters their own custom text(s) and labels.")
 st.write("__Outputs__: A summary of the text, likelihood percentages for each label and a downloadable csv of the results. \
 Includes additional options to generate a list of keywords and/or evaluate results against a list of ground truth labels, if available.")
 
@@ -110,16 +110,19 @@ with st.spinner('Loading pretrained models...'):
     kw_model = md.load_keyword_model()
     k_time = round(time.time() - start,4)
 
-st.
-
+st.spinner(f'Time taken to load various models: {k_time}s for KeyBERT model & {s_time}s for BART summarizer mnli model & {c_time}s for BART classifier mnli model.')
+# st.success(None)
 
 if submit_button or example_button:
     if len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None:
         st.error("Enter some text to generate a summary")
     else:
 
+        if len(text_input) != 0:
+            text_df = pd.DataFrame.from_dict({'title': ['sample'], 'text': [text_input]})
+
         # OPTION A:
-
+        elif uploaded_text_files is not None:
         st.markdown("### Text Inputs")
         st.write('Files concatenated into a dataframe:')
         file_names = []
@@ -141,6 +144,10 @@ if submit_button or example_button:
         # OPTION B: [TO DO: DIRECT CSV UPLOAD INSTEAD]
 
 
+        if len(text_input) != 0:
+            text_df = pd.DataFrame.from_dict({'title': ['sample'], 'text': [text_input]})
+
+
         with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
             # For each body of text, create text chunks of a certain token size required for the transformer
 
@@ -165,17 +172,22 @@ if submit_button or example_button:
             for text_chunk in text_chunks_lib[key]:
                 keywords_list = md.keyword_gen(kw_model, text_chunk)
                 kw_dict[key] = dict(keywords_list)
-
+        # Display as a dataframe
         kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
         kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
         kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()
-
+        if len(text_input) != 0:
+            title_element = []
+        else:
+            title_element = ['title']
+        kw_column_list = ['keyword', 'score']
+        kw_df = kw_df[kw_df['score'] > 0.1][title_element + kw_column_list].sort_values(title_element + ['score'], ascending=False).reset_index().drop(columns='index')
         st.dataframe(kw_df)
         st.download_button(
             label="Download data as CSV",
             data=kw_df.to_csv().encode('utf-8'),
-            file_name='
-            mime='
+            file_name='title_keywords.csv',
+            mime='title_keywords/csv',
         )
 
 
```
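A note on the new timing line: `st.spinner` returns a context manager, so the bare call added here constructs the spinner without ever displaying it (which may be why the neighboring `st.success` call is commented out). A minimal sketch of the usual pattern for a persistent status message, with illustrative timing code:

```python
import time
import streamlit as st

start = time.time()
# ... model loading would happen here ...
k_time = round(time.time() - start, 4)

# st.spinner(...) shows a spinner only when entered as a context manager
# (with st.spinner('...'): ...); as a bare statement it renders nothing.
# A message meant to stay on screen is typically written with st.success:
st.success(f'Time taken to load various models: {k_time}s for KeyBERT model.')
```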
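The heart of the change is restoring single-text entry: a pasted text is wrapped in a one-row DataFrame with the same `title`/`text` columns the file-upload path produces, so the downstream chunking, summarization, and keyword code stays input-agnostic. (The same two lines appear twice in the diff, once before OPTION A and once before the chunking step; the second occurrence looks redundant.) A standalone sketch of the pattern, with an illustrative sample text:

```python
import pandas as pd

text_input = "A long article pasted into the single text box..."

# Mirror the shape of the concatenated file uploads:
# one row per document, with 'title' and 'text' columns.
if len(text_input) != 0:
    text_df = pd.DataFrame.from_dict({'title': ['sample'], 'text': [text_input]})
    print(text_df)
#     title                                             text
# 0  sample  A long article pasted into the single text box...
```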
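The keyword post-processing added at the end of the diff works like this: KeyBERT scores are collected as a nested `{title: {keyword: score}}` dict, turned into a wide DataFrame, melted into long `(keyword, title, score)` rows, filtered at `score > 0.1`, and, in single-text mode, stripped of the now-redundant `title` column. A runnable sketch with made-up scores:

```python
import pandas as pd

# Illustrative stand-in for collected KeyBERT output: {title: {keyword: score}}.
kw_dict = {
    'doc_a': {'summarization': 0.61, 'labels': 0.34, 'the': 0.05},
    'doc_b': {'classification': 0.58, 'labels': 0.29},
}

kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
kw_df0.rename(columns={'index': 'keyword'}, inplace=True)

# Wide -> long; dropna() discards keywords a given title never produced.
kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()

single_text = False  # True when the user pasted one text instead of uploading files
title_element = [] if single_text else ['title']
kw_column_list = ['keyword', 'score']

# Keep scores above the 0.1 threshold and sort per title, highest first.
# reset_index(drop=True) matches the commit's reset_index().drop(columns='index').
kw_df = (kw_df[kw_df['score'] > 0.1][title_element + kw_column_list]
         .sort_values(title_element + ['score'], ascending=False)
         .reset_index(drop=True))
print(kw_df)
```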
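One likely bug worth flagging for a follow-up commit: `mime='title_keywords/csv'` is not a valid MIME type; the standard value for a CSV payload is `text/csv`. Browsers will generally still download the file, but a corrected sketch would be:

```python
import pandas as pd
import streamlit as st

kw_df = pd.DataFrame({'keyword': ['summarization'], 'score': [0.61]})  # illustrative

st.download_button(
    label="Download data as CSV",
    data=kw_df.to_csv().encode('utf-8'),
    file_name='title_keywords.csv',
    mime='text/csv',  # the commit ships 'title_keywords/csv', which is not a registered MIME type
)
```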