Spaces: Build error
Update app.py
app.py
CHANGED
@@ -23,22 +23,19 @@ logger = logging.getLogger(__name__)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 HfFolder.save_token(st.secrets["hf-auth-token"])
-set_auth_token(st.secrets["hf-auth-token"])
 
 @st.cache(allow_output_mutation=True)
-def load_model():
+def load_base_model():
     try:
         nltk.download('stopwords')
         nltk.download('punkt')
         # Load KeyBert Model
         tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
         kw_extractor = KeyBERT(tmp_model)
-
-        # Load T5 for Paraphrasing
 
-
-        #t5_model = export_and_get_onnx_model('valurank/t5-paraphraser')
+        # Load T5 for Paraphrasing
         t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
+        t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
         t5_model = t5_model.to(device)
         return kw_extractor, t5_model, t5_tokenizer
     except Exception:
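Note: the net effect of this hunk is that the paraphraser is loaded as an ordinary transformers checkpoint instead of the commented-out ONNX export, and the loader is renamed. A minimal standalone sketch of the resulting loading pattern, using only names visible in the diff (the hf-auth-token Space secret and the imports are assumed to be set up earlier in app.py):

import torch
import streamlit as st
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@st.cache(allow_output_mutation=True)  # legacy Streamlit cache, as used in the diff
def load_base_model():
    # KeyBERT wraps a sentence-transformers encoder for keyword scoring
    tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
    kw_extractor = KeyBERT(tmp_model)
    # T5 paraphraser: tokenizer from the base checkpoint, weights from the fine-tune
    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
    t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True).to(device)
    return kw_extractor, t5_model, t5_tokenizer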
@@ -49,11 +46,10 @@ def load_model():
 kw_extractor, t5_model, t5_tokenizer = load_model()
 
 
-@st.cache()
 def get_keybert_results_with_vectorizer(text, number_of_results=20):
-
     try:
         keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
+        keywords = [i for i in keywords if i[1] >= 0.25]
         return keywords
     except Exception:
         st.error('Error running Keybert. Please contact admin')
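Note: the behavioural change in this hunk is the new score floor, which discards KeyBERT candidates scoring below 0.25. Also note the context line above still calls load_model() although the definition was renamed to load_base_model() in the previous hunk; as committed this raises a NameError at startup, consistent with the Space's Build error status. A standalone sketch of the filtered extraction call (KeyphraseCountVectorizer comes from the keyphrase_vectorizers package; the 0.25 threshold is the value committed here):

from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

def get_keywords(kw_extractor: KeyBERT, text: str, top_n: int = 20, min_score: float = 0.25):
    # Candidate phrases come from the POS-pattern vectorizer; KeyBERT ranks
    # them by embedding similarity to the whole document.
    keywords = kw_extractor.extract_keywords(
        text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=top_n
    )
    # New in this commit: drop low-confidence candidates below the score floor.
    return [(phrase, score) for phrase, score in keywords if score >= min_score]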
@@ -61,24 +57,22 @@ def get_keybert_results_with_vectorizer(text, number_of_results=20):
 
 
 
-@st.cache()
 def t5_paraphraser(text, number_of_results=5):
     try:
-        text = "paraphrase: " + text
-        max_len = 2048
+        text = "paraphrase: " + text
         encoding = t5_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
         input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
-
+
         beam_outputs = t5_model.generate(
             input_ids=input_ids, attention_mask=attention_masks,
             do_sample=True,
-            max_length=max_len,
+            max_length=1024,
             top_k=50,
             top_p=0.95,
             early_stopping=True,
             num_return_sequences=number_of_results
         )
-
+
         final_outputs =[]
         for beam_output in beam_outputs:
             sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
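Note: generation here is sampling-based (do_sample=True with top-k and nucleus cutoffs), so each call can return several distinct paraphrases; the commit drops the removed max_len = 2048 in favour of a hard-coded max_length=1024. A condensed sketch of the same generate call, reusing device, t5_model and t5_tokenizer from the loader sketch above:

def paraphrase(text: str, number_of_results: int = 5):
    # 'paraphrase:' is the task prefix this T5 fine-tune is prompted with in the diff
    encoding = t5_tokenizer("paraphrase: " + text, return_tensors="pt")
    outputs = t5_model.generate(
        input_ids=encoding["input_ids"].to(device),
        attention_mask=encoding["attention_mask"].to(device),
        do_sample=True,            # sample rather than greedy/beam decode
        max_length=1024,           # value introduced by this commit
        top_k=50,                  # keep the 50 most likely tokens per step
        top_p=0.95,                # nucleus sampling cutoff
        num_return_sequences=number_of_results,
    )
    return [t5_tokenizer.decode(o, skip_special_tokens=True) for o in outputs]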
@@ -90,69 +84,119 @@ def t5_paraphraser(text, number_of_results=5):
 
 
 
-
-
-try:
+def run_long_extraction(article, number_of_paraphrases):
+    try:
         start1 = time.time()
         with st.spinner('Extraction Keywords from Original Document...'):
-            original_keywords = get_keybert_results_with_vectorizer(article)
-
+            original_keywords = get_keybert_results_with_vectorizer(article, number_of_results=30)
         article_sentences = sent_tokenize(article)
-        target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
+        target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
+
         st.success('Keyword Extraction from Original Document finished in {}'.format(time.time() - start1))
-
+        st.info(f'Total Sentences in Article : {len(article_sentences)}')
+        st.info(f'Total Target Sentences Selected : {len(target_sentences)}')
+
         start2 = time.time()
         with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
            t5_paraphrasing_keywords = []
-
             for sent in target_sentences:
                 ### T5
                 t5_paraphrased = t5_paraphraser(sent, number_of_results = number_of_paraphrases)
-                t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
+                t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
                 t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
-
                 t5_paraphrasing_keywords.extend(t5_keywords)
         st.success('Keyword Extraction from Paraphrased Target Sentences finished in {}'.format(time.time() - start2))
-
+
         original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
-
+
         t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
-
+
         unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0], regex=False, case=False).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
-
+
         total_end = time.time()-start1
-
+
+        return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
+    except Exception:
+        st.error('Error running Extraction Pipeline. Please contact admin')
+        logger.error(traceback.format_exc())
+
+
+
+def run_short_extraction(article, number_of_paraphrases):
+    try:
+        start1 = time.time()
+        original_keywords = get_keybert_results_with_vectorizer(article)
+        article_sentences = sent_tokenize(article)
+        st.info(f'Total Sentences in Article : {len(article_sentences)}')
+
+        target_sentences = []
+        tmp = []
+        token_count = 0
+        for i in article_sentences:
+            enc = t5_tokenizer.encode(i)
+            if token_count + len(enc) <= 96:
+                tmp.append(i)
+                token_count += len(enc)
+            else:
+                target_sentences.append(' '.join(tmp))
+                token_count = len(enc)
+                tmp = [i]
+
+        start2 = time.time()
+        with st.spinner('Extracting Keywords from Paraphrased Sentences Groups...'):
+            t5_paraphrasing_keywords = []
+            for sent in target_sentences:
+                ### T5
+                t5_paraphrased = t5_paraphraser(sent, number_of_results = number_of_paraphrases)
+                t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
+                t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
+                t5_paraphrasing_keywords.extend(t5_keywords)
+        st.success('Keyword Extraction from Paraphrased Grouped Sentences finished in {}'.format(time.time() - start2))
+
+        original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
+
+        t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+        unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0], regex=False, case=False).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+        total_end = time.time()-start1
+
         return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
     except Exception:
         st.error('Error running Extraction Pipeline. Please contact admin')
-        logger.error(traceback.format_exc())
+        logger.error(traceback.format_exc())
+
+
+
+kw_extractor, t5_model, t5_tokenizer = load_base_model()
 
 
 st.title('Exhaustive Keyword Extraction with Paraphrasing')
 with st.sidebar:
     st.header('Overview')
     st.markdown('This demo allows users to input text article and generate synonym-aware keywords. The pipeline includes the use of T5 Model for paraphrasing target sentences, and Sentence-transformers based Keyword Extraction')
-
+
     st.header('Parameters')
-    number_of_keywords = st.slider('Number of Keywords to extract for each target sentence', min_value=5, max_value=50, step=5, value=20)
+    # number_of_keywords = st.slider('Number of Keywords to extract for each target sentence', min_value=5, max_value=50, step=5, value=20)
     number_of_paraphrases = st.slider('Number of Paraphrased versions to generate for each target sentence', min_value=1, max_value=20, step=1, value=5)
-
+
     st.header('Specifications')
-    st.markdown('To generate context aware and OOV keywords, we first run KeyBert for keyword extraction on the original article. The sentences which had Keywords are then passed through T5 for generating multiple paraphrased versions. These paraphrased sentences are then run through Keyword Extraction again to generate the final results')
+    # st.markdown('To generate context aware and OOV keywords for long, we first run KeyBert for keyword extraction on the original article. The sentences which had Keywords are then passed through T5 for generating multiple paraphrased versions. These paraphrased sentences are then run through Keyword Extraction again to generate the final results')
+
+
 
 doc = st.text_area("Enter a custom document")
 if doc:
-    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end =
-
+    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = check_document_length(doc, number_of_paraphrases)
+
     # extract_paraphrased_article(input_list[0])
     st.text(f'PIPELINE RUNTIME: {total_end}\n')
-
+
     st.subheader('\nOriginal Keywords Extracted:\n\n')
     st.dataframe(original_keywords_df)
-
+
     st.subheader('\nT5 Unique New Keywords Extracted:\n\n')
     st.dataframe(unique_keywords_df)
-
+
     st.subheader('\nT5 Keywords Extracted:\n\n')
     st.dataframe(t5_keywords_df)
-
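Note: run_short_extraction adds a packing step that greedily groups consecutive sentences so each group stays within a 96-token T5 budget before paraphrasing. As committed, the loop never flushes the final tmp buffer, so the trailing group is silently dropped. A standalone sketch of the same technique with that flush added (function name and signature are illustrative, not from the diff):

def group_sentences(sentences, tokenizer, budget=96):
    # Greedily pack consecutive sentences into chunks of at most `budget` tokens.
    groups, current, count = [], [], 0
    for sent in sentences:
        n = len(tokenizer.encode(sent))
        if count + n <= budget:
            current.append(sent)
            count += n
        else:
            if current:  # guard against an oversize first sentence
                groups.append(' '.join(current))
            current, count = [sent], n
    if current:  # flush the trailing chunk, which the committed loop drops
        groups.append(' '.join(current))
    return groups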
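Note: the UI now routes through check_document_length(doc, number_of_paraphrases), but no definition of that function appears anywhere in this diff, which is another likely cause of the Build error. Given the two pipelines added above, a plausible, purely hypothetical dispatcher would switch on document length; the 96-token threshold below is an assumption borrowed from the grouping budget, not something this commit specifies:

def check_document_length(doc, number_of_paraphrases):
    # Hypothetical dispatcher: short documents take the sentence-grouping path,
    # longer ones the keyword-bearing target-sentence path. The threshold is an
    # assumption, not a value committed in this diff.
    if len(t5_tokenizer.encode(doc)) <= 96:
        return run_short_extraction(doc, number_of_paraphrases)
    return run_long_extraction(doc, number_of_paraphrases)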