Update app.py

app.py CHANGED
The first hunk adds `traceback` and `logging` imports, moves the `nltk.download` calls below the rest of the imports, and creates a module-level logger:

```diff
@@ -10,12 +10,19 @@ from transformers import T5ForConditionalGeneration,T5Tokenizer
 
 import nltk
 from nltk.tokenize import sent_tokenize
-nltk.download('stopwords')
-nltk.download('punkt')
 
 from huggingface_hub import snapshot_download, HfFolder
 import streamlit as st
 
+import traceback
+import logging
+
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+logger = logging.getLogger(__name__)
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 HfFolder.save_token(st.secrets["hf-auth-token"])
```
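A side note on the new logger: with no handler configured, Python's last-resort handler prints `logger.error` output to stderr as a bare message, without timestamps or logger names. A minimal sketch (an assumption, not part of this commit) that makes the traceback lines easier to find in the Space's container logs:

```python
import logging

# Not in this commit: configure the root logger once at startup so that
# logger.error(traceback.format_exc()) is emitted with a timestamp and
# the module name instead of the bare last-resort format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)
logger = logging.getLogger(__name__)
```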
The second hunk wraps each stage of the pipeline in a `try`/`except` that reports a short `st.error` message in the UI and logs the full traceback. Where the diff viewer dropped the left-hand column, the removed lines below are inferred from the re-added ones. In `load_model()`:

```diff
@@ -23,85 +30,101 @@ HfFolder.save_token(st.secrets["hf-auth-token"])
 
 @st.cache(allow_output_mutation=True)
 def load_model():
-    # Load KeyBert Model
-    tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
-    kw_extractor = KeyBERT(tmp_model)
-
-    # Load T5 for Paraphrasing
-    t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
-    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
-    t5_model = t5_model.to(device)
-    return kw_extractor, t5_model, t5_tokenizer
+    try:
+        # Load KeyBert Model
+        tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
+        kw_extractor = KeyBERT(tmp_model)
+
+        # Load T5 for Paraphrasing
+        t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
+        t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
+        t5_model = t5_model.to(device)
+        return kw_extractor, t5_model, t5_tokenizer
+    except Exception:
+        st.error('Error Loading Models. Please contact admin')
+        logger.error(traceback.format_exc())
 
 kw_extractor, t5_model, t5_tokenizer = load_model()
```
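One behavior worth noting: when the `except` branch runs, `load_model()` implicitly returns `None`, so the module-level unpacking above raises `TypeError: cannot unpack non-iterable NoneType object` immediately after the error message is shown. A possible guard (a sketch, not part of the commit):

```python
# Sketch only, not in this commit: halt the script run cleanly when model
# loading fails, instead of crashing on tuple unpacking of None.
models = load_model()
if models is None:
    st.stop()  # aborts the current Streamlit script run
kw_extractor, t5_model, t5_tokenizer = models
```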
`get_keybert_results_with_vectorizer()` gets the same wrapper:

```diff
 @st.cache()
 def get_keybert_results_with_vectorizer(text, number_of_results=20):
-    keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
-    return keywords
+
+    try:
+        keywords = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=number_of_results)
+        return keywords
+    except Exception:
+        st.error('Error running Keybert. Please contact admin')
+        logger.error(traceback.format_exc())
+
```
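For reference, KeyBERT's `extract_keywords` returns a list of `(keyphrase, score)` tuples, which is the shape the pipeline below relies on (`kw[0]` is the phrase, `kw[1]` the score). An illustrative call, with invented scores:

```python
# Illustrative only: the input text and scores are made up for the example.
sample = "Streamlit makes it easy to build and share custom data apps."
keywords = get_keybert_results_with_vectorizer(sample, number_of_results=5)
# -> e.g. [('custom data apps', 0.62), ('streamlit', 0.48), ...]
```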
`t5_paraphraser()` likewise moves its whole body into the `try` block:

```diff
 @st.cache()
 def t5_paraphraser(text, number_of_results=5):
-    text = "paraphrase: " + text + " </s>"
-    max_len = 2048
-    encoding = t5_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
-    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
-
-    beam_outputs = t5_model.generate(
-        input_ids=input_ids, attention_mask=attention_masks,
-        do_sample=True,
-        max_length=2048,
-        top_k=50,
-        top_p=0.95,
-        early_stopping=True,
-        num_return_sequences=number_of_results
-    )
-
-    final_outputs =[]
-    for beam_output in beam_outputs:
-        sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-        final_outputs.append(sent)
-    return final_outputs
+    try:
+        text = "paraphrase: " + text + " </s>"
+        max_len = 2048
+        encoding = t5_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
+        input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+        beam_outputs = t5_model.generate(
+            input_ids=input_ids, attention_mask=attention_masks,
+            do_sample=True,
+            max_length=2048,
+            top_k=50,
+            top_p=0.95,
+            early_stopping=True,
+            num_return_sequences=number_of_results
+        )
+
+        final_outputs =[]
+        for beam_output in beam_outputs:
+            sent = t5_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            final_outputs.append(sent)
+        return final_outputs
+    except Exception:
+        st.error('Error running T5 Paraphrasing. Please contact admin')
+        logger.error(traceback.format_exc())
```
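Two small issues carry over from the old body: `max_len` is assigned but never used, and `pad_to_max_length=` is deprecated in recent `transformers` releases. `T5Tokenizer` also appends the `</s>` token itself, so concatenating it manually is redundant. A sketch of an equivalent encoding call, assuming a recent `transformers` version (not part of this commit):

```python
# Assumption: the transformers >= 4.x tokenizer __call__ API. Truncation
# bounds very long inputs; the EOS token is added automatically.
encoding = t5_tokenizer(
    "paraphrase: " + text,
    max_length=512,
    truncation=True,
    return_tensors="pt",
)
```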
Finally, `extract_paraphrased_sentences()`, the full pipeline, is wrapped the same way. Its previous body was replaced wholesale and the left-hand column was not preserved by the viewer, so only the new version is shown:

```diff
 #### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
 def extract_paraphrased_sentences(article):
+    try:
+        start1 = time.time()
+        with st.spinner('Extraction Keywords from Original Document...'):
+            original_keywords = get_keybert_results_with_vectorizer(article)
+
+        article_sentences = sent_tokenize(article)
+        target_sentences = [sent for sent in article_sentences if any(kw[0] in sent for kw in original_keywords)]
+        st.success('Keyword Extraction from Original Document finished in {}'.format(time.time() - start1))
+
+        start2 = time.time()
+        with st.spinner('Extracting Keywords from Paraphrased Target Sentences...'):
+            t5_paraphrasing_keywords = []
+
+            for sent in target_sentences:
+                ### T5
+                t5_paraphrased = t5_paraphraser(sent)
+                t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
+                t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
+
+                t5_paraphrasing_keywords.extend(t5_keywords)
+        st.success('Keyword Extraction from Paraphrased Target Sentences finished in {}'.format(time.time() - start2))
+
+        original_keywords_df = pd.DataFrame(original_keywords, columns=['Keyword', 'Score'])
+
+        t5_keywords_df = pd.DataFrame(t5_paraphrasing_keywords, columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+        unique_keywords_df = pd.DataFrame([i for i in t5_paraphrasing_keywords if not original_keywords_df['Keyword'].str.contains(i[0]).any()], columns=['Keyword', 'Score']).sort_values(by='Score', ascending=False).drop_duplicates(subset=['Keyword'], keep='first').reset_index(drop=True)
+
+        total_end = time.time()-start1
+
+        return t5_keywords_df, original_keywords_df, unique_keywords_df, total_end
+    except Exception:
+        st.error('Error running Extraction Pipeline. Please contact admin')
+        logger.error(traceback.format_exc())
 
 doc = st.text_area("Enter a custom document")
```
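The diff ends at the `text_area` input, so the rendering code below it is not shown. A hypothetical wiring of the pieces above, matching the function's four return values:

```python
# Hypothetical, not shown in this diff: run the pipeline on the entered
# document and render the three keyword tables plus the elapsed time.
if doc:
    t5_df, original_df, unique_df, elapsed = extract_paraphrased_sentences(doc)
    st.text('Total time: {:.2f}s'.format(elapsed))
    st.dataframe(original_df)
    st.dataframe(t5_df)
    st.dataframe(unique_df)
```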