numBery committed
Commit 7d24d84
1 Parent(s): 482860c

Update app.py

Files changed (1)
  app.py  +84 -40
app.py CHANGED
@@ -23,22 +23,19 @@ logger = logging.getLogger(__name__)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  HfFolder.save_token(st.secrets["hf-auth-token"])
- set_auth_token(st.secrets["hf-auth-token"])

  @st.cache(allow_output_mutation=True)
- def load_model():
+ def load_base_model():
      try:
          nltk.download('stopwords')
          nltk.download('punkt')
          # Load KeyBert Model
          tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
          kw_extractor = KeyBERT(tmp_model)
-
-         # Load T5 for Paraphrasing

-         t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
-         #t5_model = export_and_get_onnx_model('valurank/t5-paraphraser')
+         # Load T5 for Paraphrasing
          t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
+         t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
          t5_model = t5_model.to(device)
          return kw_extractor, t5_model, t5_tokenizer
      except Exception:
@@ -49,11 +46,10 @@ def load_model():
  kw_extractor, t5_model, t5_tokenizer = load_model()


- @st.cache()
  def get_keybert_results_with_vectorizer(text, number_of_results=20):
-
      try:
          keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
+         keywords = [i for i in keywords if i[1] >= 0.25]
          return keywords
      except Exception:
          st.error('Error running Keybert. Please contact admin')
@@ -61,24 +57,22 @@ def get_keybert_results_with_vectorizer(text, number_of_results=20):



- @st.cache()
  def t5_paraphraser(text, number_of_results=5):
      try:
-         text = "paraphrase: " + text + " </s>"
-         max_len = 2048
+         text = "paraphrase: " + text
          encoding = t5_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
          input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
-
+
          beam_outputs = t5_model.generate(
              input_ids=input_ids, attention_mask=attention_masks,
              do_sample=True,
-             max_length=2048,
+             max_length=1024,
              top_k=50,
              top_p=0.95,
              early_stopping=True,
              num_return_sequences=number_of_results
          )
-
+
          final_outputs =[]
          for beam_output in beam_outputs:
              sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
@@ -90,69 +84,119 @@ def t5_paraphraser(text, number_of_results=5):



- #### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
- def extract_paraphrased_sentences(article, number_of_keywords, number_of_paraphrases):
-     try:
+ def run_long_extraction(article, number_of_paraphrases):
+     try:
          start1 = time.time()
          with st.spinner('Extraction Keywords from Original Document...'):
-             original_keywords = get_keybert_results_with_vectorizer(article)
-
+             original_keywords = get_keybert_results_with_vectorizer(article, number_of_results=30)
              article_sentences = sent_tokenize(article)
-             target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
+             target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
+
          st.success('Keyword Extraction from Original Document finished in {}'.format(time.time() - start1))
-
+         st.info(f'Total Sentences in Article : {len(article_sentences)}')
+         st.info(f'Total Target Sentences Selected : {len(target_sentences)}')
+
          start2 = time.time()
          with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
              t5_paraphrasing_keywords = []
-
              for sent in target_sentences:
                  ### T5
                  t5_paraphrased = t5_paraphraser(sent, number_of_results = number_of_paraphrases)
-                 t5_keywords = [get_keybert_results_with_vectorizer(i, number_of_results = number_of_keywords) for i in t5_paraphrased]
+                 t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
                  t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
-
                  t5_paraphrasing_keywords.extend(t5_keywords)
          st.success('Keyword Extraction from Paraphrased Target Sentences finished in {}'.format(time.time() - start2))
-
+
          original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
-
+
          t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
-
+
          unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0], regex=False, case=False).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
-
+
          total_end = time.time()-start1
-
+
+         return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
+     except Exception:
+         st.error('Error running Extraction Pipeline. Please contact admin')
+         logger.error(traceback.format_exc())
+
+
+
+ def run_short_extraction(article, number_of_paraphrases):
+     try:
+         start1 = time.time()
+         original_keywords = get_keybert_results_with_vectorizer(article)
+         article_sentences = sent_tokenize(article)
+         st.info(f'Total Sentences in Article : {len(article_sentences)}')
+
+         target_sentences = []
+         tmp = []
+         token_count = 0
+         for i in article_sentences:
+             enc = t5_tokenizer.encode(i)
+             if token_count + len(enc) <= 96:
+                 tmp.append(i)
+                 token_count += len(enc)
+             else:
+                 target_sentences.append(' '.join(tmp))
+                 token_count = len(enc)
+                 tmp = [i]
+
+         start2 = time.time()
+         with st.spinner('Extracting Keywords from Paraphrased Sentences Groups...'):
+             t5_paraphrasing_keywords = []
+             for sent in target_sentences:
+                 ### T5
+                 t5_paraphrased = t5_paraphraser(sent, number_of_results = number_of_paraphrases)
+                 t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
+                 t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
+                 t5_paraphrasing_keywords.extend(t5_keywords)
+         st.success('Keyword Extraction from Paraphrased Grouped Sentences finished in {}'.format(time.time() - start2))
+
+         original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
+
+         t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+         unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0], regex=False, case=False).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+         total_end = time.time()-start1
+
          return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
      except Exception:
          st.error('Error running Extraction Pipeline. Please contact admin')
-         logger.error(traceback.format_exc())
+         logger.error(traceback.format_exc())
+
+
+
+ kw_extractor, t5_model, t5_tokenizer = load_base_model()


  st.title('Exhaustive Keyword Extraction with Paraphrasing')
  with st.sidebar:
      st.header('Overview')
      st.markdown('This demo allows users to input text article and generate synonym-aware keywords. The pipeline includes the use of T5 Model for paraphrasing target sentences, and Sentence-transformers based Keyword Extraction')
-
+
      st.header('Parameters')
-     number_of_keywords = st.slider('Number of Keywords to extract for each target sentence', min_value=5, max_value=50, step=5, value=20)
+     # number_of_keywords = st.slider('Number of Keywords to extract for each target sentence', min_value=5, max_value=50, step=5, value=20)
      number_of_paraphrases = st.slider('Number of Paraphrased versions to generate for each target sentence', min_value=1, max_value=20, step=1, value=5)
-
+
      st.header('Specifications')
-     st.markdown('To generate context aware and OOV keywords, we first run KeyBert for keyword extraction on the original article. The sentences which had Keywords are then passed through T5 for generating multiple paraphrased versions. These paraphrased sentences are then run through Keyword Extraction again to generate the final results')
+     # st.markdown('To generate context aware and OOV keywords for long, we first run KeyBert for keyword extraction on the original article. The sentences which had Keywords are then passed through T5 for generating multiple paraphrased versions. These paraphrased sentences are then run through Keyword Extraction again to generate the final results')
+
+

  doc = st.text_area("Enter a custom document")
  if doc:
-     t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = extract_paraphrased_sentences(doc, number_of_keywords, number_of_paraphrases)
-
+     t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = check_document_length(doc, number_of_paraphrases)
+
      # extract_paraphrased_article(input_list[0])
      st.text(f'PIPELINE RUNTIME: {total_end}\n')
-
+
      st.subheader('\nOriginal Keywords Extracted:\n\n')
      st.dataframe(original_keywords_df)
-
+
      st.subheader('\nT5 Unique New Keywords Extracted:\n\n')
      st.dataframe(unique_keywords_df)
-
+
      st.subheader('\nT5 Keywords Extracted:\n\n')
      st.dataframe(t5_keywords_df)
-
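The new driver line calls check_document_length(doc, number_of_paraphrases), but no definition of check_document_length appears anywhere in this diff, so as committed the app would raise a NameError when a document is submitted (unless the function arrives in a separate commit). A minimal sketch of what such a dispatcher might look like, assuming it routes on the article's T5 token count; the 512-token cut-off is an assumption, not something the commit specifies:

# Hypothetical sketch only: check_document_length is called above but never
# defined in this diff. A plausible version routes short articles to
# run_short_extraction and longer ones to run_long_extraction.
def check_document_length(article, number_of_paraphrases):
    token_count = len(t5_tokenizer.encode(article))
    if token_count <= 512:  # assumed cut-off between "short" and "long"
        return run_short_extraction(article, number_of_paraphrases)
    return run_long_extraction(article, number_of_paraphrases)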
 
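Separately, the sentence-grouping loop added in run_short_extraction never flushes its final tmp buffer: the trailing group of sentences is silently dropped, and an article that fits entirely within the 96-token budget produces no target sentences at all. A sketch of the same loop with a trailing flush, keeping the commit's 96-token budget:

# Possible follow-up fix (not part of this commit): flush the last buffered
# group after the loop, and skip empty groups.
target_sentences = []
tmp, token_count = [], 0
for sent in article_sentences:
    enc = t5_tokenizer.encode(sent)
    if token_count + len(enc) <= 96:
        tmp.append(sent)
        token_count += len(enc)
    else:
        if tmp:  # a single over-budget sentence would otherwise emit ''
            target_sentences.append(' '.join(tmp))
        tmp, token_count = [sent], len(enc)
if tmp:
    target_sentences.append(' '.join(tmp))  # flush the final group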
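The rename also looks incomplete as rendered here: hunk 2 keeps kw_extractor, t5_model, t5_tokenizer = load_model() as an unchanged context line (new line 46), while the added call to load_base_model() only appears at new line 171. If that earlier line really survives the commit, it would raise a NameError at startup now that load_model is gone, and a one-line follow-up removing it would be needed:

- kw_extractor, t5_model, t5_tokenizer = load_model()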
 
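For readers reconstructing the pipeline from the now commented-out Specifications text: stripped of the Streamlit UI, the long-document path amounts to the sketch below. The names mirror the app's own functions; the de-duplication against the original keywords is a simplified stand-in for the DataFrame logic above.

# Framework-free sketch of the described pipeline: KeyBERT on the article ->
# keep keyword-bearing sentences -> paraphrase each with T5 -> KeyBERT on the
# paraphrases -> keep only keywords not already found in the original.
def exhaustive_keywords(article, number_of_paraphrases=5):
    original = get_keybert_results_with_vectorizer(article, number_of_results=30)
    targets = [s for s in sent_tokenize(article)
               if any(kw in s for kw, _ in original)]
    candidates = []
    for sentence in targets:
        for paraphrase in t5_paraphraser(sentence, number_of_results=number_of_paraphrases):
            candidates.extend(get_keybert_results_with_vectorizer(paraphrase))
    seen = {kw.lower() for kw, _ in original}
    new_keywords = [(kw, score) for kw, score in candidates if kw.lower() not in seen]
    return original, sorted(new_keywords, key=lambda kw: kw[1], reverse=True)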