Paula Leonova committed on
Commit: f60d1c6
Parent: 51fcc5c

Add KeyBERT to generate top keywords

Files changed (2):
  1. app.py +25 -24
  2. models.py +2 -2
app.py CHANGED
@@ -51,7 +51,7 @@ with st.form(key='my_form'):
 
 
 
-with st.spinner('Loading pretrained summarizer and classifier mnli model...'):
+with st.spinner('Loading pretrained models...'):
     start = time.time()
     summarizer = md.load_summary_model()
     s_time = round(time.time() - start,4)
@@ -60,13 +60,11 @@ with st.spinner('Loading pretrained summarizer and classifier mnli model...'):
     classifier = md.load_model()
     c_time = round(time.time() - start,4)
 
-    st.success(f'Time taken to load: summarizer mnli model {s_time}s & classifier mnli model {c_time}s')
-
-    # with st.spinner('Loading pretrained classifier mnli model...'):
-    #     start = time.time()
-    #     classifier = md.load_model()
-    #     st.success(f'Time taken to load classifier mnli model: {round(time.time() - start,4)} seconds')
+    start = time.time()
+    kw_model = md.load_keyword_model()
+    k_time = round(time.time() - start,4)
 
+    st.success(f'Time taken to load BART summarizer mnli model: {s_time}s & BART classifier mnli model: {c_time}s & KeyBERT model: {k_time}s')
 
 if submit_button:
     if len(text_input) == 0:
@@ -80,22 +78,31 @@ if submit_button:
     for n in range(0, len(nested_sentences)):
         tc = " ".join(map(str, nested_sentences[n]))
         text_chunks.append(tc)
+
+    if gen_keywords == 'Yes':
+        st.markdown("### Top Keywords")
+        with st.spinner("Generating keywords from text..."):
+
+            kw_df = pd.DataFrame()
+            for text_chunk in text_chunks:
+                keywords_list = md.keyword_gen(kw_model, text_chunk)
+                kw_df = kw_df.append(pd.DataFrame(keywords_list))
+            kw_df.columns = ['keyword', 'score']
+            top_kw_df = kw_df.groupby('keyword')['score'].max().reset_index()
+
+            top_kw_df = top_kw_df.sort_values('score', ascending = False).reset_index().drop(['index'], axis=1)
+            st.dataframe(top_kw_df.head(10))
 
+    st.markdown("### Text Chunk & Summaries")
     with st.spinner('Generating summaries for text chunks...'):
 
-        my_expander = st.expander(label='Expand to see summary generation details')
+        my_expander = st.expander(label='Expand to see intermediate summary generation details')
         with my_expander:
             summary = []
-            st.markdown("### Text Chunk & Summaries")
-            # st.markdown("_Breaks up the original text into sections with complete sentences totaling \
-            # less than 1024 tokens, a requirement for the summarizer. Each block of text is than summarized separately \
-            # and then combined at the very end to generate the final summary._")
-
-            # # For each chunk of sentences (within the token max), generate a summary
-            # for n in range(0, len(nested_sentences)):
-            #     text_chunk = " ".join(map(str, nested_sentences[n]))
-            #     st.markdown(f"###### Original Text Chunk {n+1}/{len(nested_sentences)}" )
-            #     st.markdown(text_chunk)
+
+            st.markdown("_The original text is broken into chunks with complete sentences totaling \
+            fewer than 1024 tokens, a requirement for the summarizer. Each block of text is then summarized separately \
+            and then combined at the very end to generate the final summary._")
 
             for num_chunk, text_chunk in enumerate(text_chunks):
                 st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
@@ -108,15 +115,9 @@ if submit_button:
             # Combine all the summaries into a list and compress into one document, again
             final_summary = " \n\n".join(list(summary))
 
-            # final_summary = md.summarizer_gen(summarizer, sequence=text_input, maximum_tokens = 30, minimum_tokens = 100)
             st.markdown("### Combined Summary")
             st.markdown(final_summary)
 
-            # if gen_keywords == 'Yes':
-            #     st.markdown("### Top Keywords")
-            #     with st.spinner("Generating keywords from text...")
-            #     keywords =
-
     if len(text_input) == 0 or len(labels) == 0:
         st.write('Enter some text and at least one possible topic to see predictions.')
     else:
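Note: the keyword aggregation added above relies on pandas.DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0, so the loop fails on current pandas. Below is a minimal sketch of the same per-chunk aggregation using pd.concat, assuming keyword_gen returns a list of (keyword, score) tuples per chunk as KeyBERT's extract_keywords does; the sample chunk data is made up for illustration.

```python
import pandas as pd

# Hypothetical per-chunk output in the shape KeyBERT's extract_keywords
# returns: one list of (keyword, score) tuples per text chunk.
chunk_keywords = [
    [("summarization", 0.61), ("transformer", 0.48)],
    [("keywords", 0.55), ("summarization", 0.59)],
]

# Stack all chunks into one frame (pd.concat instead of the removed
# DataFrame.append), then keep each keyword's best score across chunks.
kw_df = pd.concat(
    [pd.DataFrame(kws, columns=["keyword", "score"]) for kws in chunk_keywords],
    ignore_index=True,
)
top_kw_df = (
    kw_df.groupby("keyword", as_index=False)["score"].max()
         .sort_values("score", ascending=False)
         .reset_index(drop=True)
)
print(top_kw_df.head(10))  # e.g. summarization 0.61, keywords 0.55, ...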
models.py CHANGED
@@ -34,9 +34,9 @@ def create_nest_sentences(document:str, token_max_length = 1024):
 @st.cache(allow_output_mutation=True)
 def load_keyword_model():
     kw_model = KeyBERT()
-    return ky_model
+    return kw_model
 
-def keyword_gen(sequence:str):
+def keyword_gen(kw_model, sequence:str):
     keywords = kw_model.extract_keywords(sequence,
                                          keyphrase_ngram_range=(1, 1),
                                          stop_words='english',
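For reference, a standalone sketch of the corrected models.py helpers. The diff truncates extract_keywords' argument list after stop_words, so top_n=10 below is an assumption rather than the repo's actual value; KeyBERT's extract_keywords returns (keyword, score) tuples ranked by cosine similarity between document and candidate embeddings.

```python
from keybert import KeyBERT

def load_keyword_model():
    # Construct once and reuse; loading the underlying sentence-transformer
    # is the expensive step (hence @st.cache in the app itself).
    return KeyBERT()

def keyword_gen(kw_model, sequence: str):
    # Single-word candidates, English stop words removed. top_n=10 is an
    # assumed value; the committed code is truncated at this point.
    return kw_model.extract_keywords(
        sequence,
        keyphrase_ngram_range=(1, 1),
        stop_words="english",
        top_n=10,
    )

kw_model = load_keyword_model()
print(keyword_gen(kw_model, "KeyBERT extracts keywords with BERT embeddings."))
# e.g. [('keywords', 0.65), ('embeddings', 0.52), ...]
```

Passing kw_model into keyword_gen (rather than referencing a global, as the pre-fix code implicitly did) is what lets app.py load the model once under st.cache and reuse it across chunks.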