Spaces:
Sleeping
Sleeping
Created app.py
Browse files
app.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
import torch
|
4 |
+
import numpy as np
|
5 |
+
import nltk
|
6 |
+
nltk.download('stopwords')
|
7 |
+
nltk.download('punkt')
|
8 |
+
from nltk.corpus import stopwords
|
9 |
+
from nltk.tokenize import word_tokenize, sent_tokenize
|
10 |
+
from nltk.cluster.util import cosine_distance
|
11 |
+
import networkx as nx
|
12 |
+
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
|
13 |
+
from transformers import BartTokenizer, BartForConditionalGeneration
|
14 |
+
|
15 |
+
|
16 |
+
# Run the models on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
22 |
+
|
23 |
+
# Load the Pegasus and BART models. The loaders are cached so that the models are not loaded again each time the script runs.
|
24 |
+
|
25 |
+
@st.cache(allow_output_mutation=True)
def load_pegasus_model():
    """Load the ``google/pegasus-xsum`` model together with its tokenizer.

    The model is moved to the module-level ``device``. The function is
    wrapped in ``st.cache`` so the returned pair is stored and reused.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    checkpoint = "google/pegasus-xsum"
    pegasus = PegasusForConditionalGeneration.from_pretrained(checkpoint)
    pegasus = pegasus.to(device)
    pegasus_tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
    return pegasus, pegasus_tokenizer
|
31 |
+
|
32 |
+
@st.cache(allow_output_mutation=True)
def load_bart_model():
    """Load the ``facebook/bart-large-cnn`` model together with its tokenizer.

    The model is moved to the module-level ``device``. The function is
    wrapped in ``st.cache`` so the returned pair is stored and reused.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    checkpoint = "facebook/bart-large-cnn"
    bart = BartForConditionalGeneration.from_pretrained(checkpoint)
    bart = bart.to(device)
    bart_tokenizer = BartTokenizer.from_pretrained(checkpoint)
    return bart, bart_tokenizer
|
38 |
+
|
39 |
+
# EXTRACTIVE SUMMARY
|
40 |
+
|
41 |
+
#Cleaning the given text so that it gives better output for the extractive summarisation
|
42 |
+
|
43 |
+
def clean_text(text):
    """Split raw text into sentences, each represented as a list of tokens.

    The text is cut at every ``"."``. Within each piece, commas and
    apostrophes are padded with spaces so that they become tokens of their
    own, and the piece is then split on whitespace.

    Pieces that contain no tokens (empty or whitespace-only, e.g. the
    remainder after a trailing ``". "``) are dropped, so every returned
    sentence has at least one token.

    Args:
        text: The text to tokenise.

    Returns:
        list[list[str]]: One token list per non-empty sentence, in the
        order the sentences appear in ``text``.
    """
    sentences = []
    for raw_sentence in text.split("."):
        spaced = raw_sentence.replace(",", " , ").replace("'", " ' ")
        # split() without arguments also discards tabs/newlines and never
        # produces empty tokens.
        tokens = spaced.split()
        if tokens:
            sentences.append(tokens)
    return sentences
|
58 |
+
|
59 |
+
def sentence_similarity(sent1, sent2, stopwords):
    """Return the cosine similarity of two tokenised sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words found in ``stopwords`` are not counted.

    Args:
        sent1: First sentence as a list of word strings.
        sent2: Second sentence as a list of word strings.
        stopwords: Iterable of (lower-case) words to ignore, or ``None`` to
            ignore nothing.

    Returns:
        float: Cosine similarity of the two count vectors. ``0.0`` is
        returned when either vector is all zeros (an empty sentence or one
        made up only of stop words), where the cosine is undefined.
    """
    ignored = set(stopwords) if stopwords is not None else set()

    words1 = [w.lower() for w in sent1]
    words2 = [w.lower() for w in sent2]

    # Map each distinct word to its vector position for O(1) lookups.
    position = {word: i for i, word in enumerate(set(words1 + words2))}

    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))

    # build the vector for the first sentence
    for w in words1:
        if w not in ignored:
            vector1[position[w]] += 1

    # build the vector for the second sentence
    for w in words2:
        if w not in ignored:
            vector2[position[w]] += 1

    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        # Avoid 0/0 -> NaN, which would corrupt the similarity matrix.
        return 0.0
    return float(np.dot(vector1, vector2) / norm_product)
|
85 |
+
|
86 |
+
|
87 |
+
def build_similarity_matrix(sentences, stop_words):
    """Return the pairwise similarity matrix for a list of sentences.

    Entry ``[i][j]`` holds ``sentence_similarity(sentences[i], sentences[j],
    stop_words)`` for ``i != j``; the diagonal is left at zero.

    Args:
        sentences: List of tokenised sentences.
        stop_words: Words passed through to ``sentence_similarity``.

    Returns:
        numpy.ndarray: Square matrix with one row and column per sentence.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is not compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stop_words)

    return matrix
|
99 |
+
|
100 |
+
def generate_summary(text, top_n='2'):
    """Build an extractive summary from the highest-ranked sentences of ``text``.

    The text is tokenised with ``clean_text``, sentences are compared
    pairwise with ``build_similarity_matrix``, and ``nx.pagerank`` is run on
    the graph built from that matrix to score each sentence.

    Args:
        text: The text to summarise.
        top_n: Number of sentences to keep, as an int or a numeric string.
            ``None`` or ``""`` selects the default of 2. Values larger than
            the number of sentences are reduced to that number.

    Returns:
        str: The selected sentences in descending score order, joined with
        ``". "`` and terminated with ``"."``. An empty string is returned
        when there is nothing to select (no sentences, or ``top_n`` < 1).

    Raises:
        ValueError: If ``top_n`` cannot be converted with ``int()``.
    """
    if top_n is None or top_n == "":
        top_n = 2
    top_n = int(top_n)

    # Step 1 - Clean text to generate sentences.
    # Token-less sentences are dropped because the first token is indexed below.
    sentences = [tokens for tokens in clean_text(text) if tokens]

    # Never ask for more sentences than exist (previously an IndexError).
    top_n = min(top_n, len(sentences))
    if top_n < 1:
        return ""

    stop_words = stopwords.words('english')
    stop_words.extend([".", ","])

    # Step 2 - Generate similarity matrix across sentences.
    similarity_matrix = build_similarity_matrix(sentences, stop_words)
    # A sentence whose words are all stop words can produce an undefined
    # (NaN) similarity; treat that as "no similarity" so ranking still works.
    similarity_matrix = np.nan_to_num(similarity_matrix)

    # Step 3 - Rank sentences in the similarity matrix.
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    # Step 4 - Sort by score (descending) and pick the top sentences.
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    summarize_text = []
    for _, tokens in ranked[:top_n]:
        first_word = tokens[0]
        # Upper-case only the first letter; str.capitalize() would also
        # lower-case the rest of the word (e.g. "AI-ready" -> "Ai-ready").
        capitalised = first_word[:1].upper() + first_word[1:]
        summarize_text.append(" ".join([capitalised, *tokens[1:]]))

    # Step 5 - Output the summarised text, undoing the padding that
    # clean_text put around commas and apostrophes.
    extractive_summarized = ". ".join(summarize_text).replace(" , ", ", ").replace(" ' ", "'") + "."
    return extractive_summarized
|
140 |
+
|
141 |
+
|
142 |
+
|
143 |
+
#ABSTRACTIVE SUMMARY
|
144 |
+
|
145 |
+
#Converting sentence into tokens and then extracting the output from the tokens
|
146 |
+
|
147 |
+
def run_model(model, input_text, min_length=30, max_length=128, num_return_sequences=1):
    """Generate an abstractive summary and display it in the Streamlit app.

    Args:
        model: ``"Bart"`` selects the BART model (4 beams); any other value
            selects the Pegasus model (10 beams).
        input_text: The text to summarise.
        min_length: ``min_length`` passed to ``generate``.
        max_length: ``max_length`` passed to ``generate``.
        num_return_sequences: ``num_return_sequences`` passed to
            ``generate``. Only the first returned sequence is displayed.

    Returns:
        None. The summary is written to the page with ``st.write`` and
        ``st.success``.
    """
    if model == "Bart":
        summarizer, tokenizer = load_bart_model()
        num_beams = 4
    else:
        summarizer, tokenizer = load_pegasus_model()
        num_beams = 10

    # Collapse all runs of whitespace into single spaces.
    input_text = ' '.join(input_text.split())

    # Call the tokenizer directly with truncation enabled for both models:
    # the BART path previously encoded without truncation, so inputs longer
    # than the model accepts would fail, and the Pegasus path relied on the
    # deprecated prepare_seq2seq_batch helper.
    batch = tokenizer(input_text, truncation=True, padding='longest', return_tensors="pt").to(device)

    summary_ids = summarizer.generate(
        **batch,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        length_penalty=1,
        min_length=min_length,
        max_length=max_length,
        early_stopping=True,
    )

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    st.write('Summary')
    st.success(output)
|
183 |
+
|
184 |
+
def main():
    """Render the Streamlit page and run the summariser the user selects.

    The page has a text input, a selector for extractive or abstractive
    summarisation, and one form per mode. On submit, the input is validated
    and then passed to ``generate_summary`` (extractive) or ``run_model``
    (abstractive).
    """
    st.title('Text Summarizer')
    text = st.text_input("Enter Text")

    summary_kind = st.selectbox('Select Summary', ["Extractive Summary", "Abstractive Summary"], key="Summary")
    if summary_kind == "Extractive Summary":
        with st.form("my_form"):
            st.write("Extractive Summary")
            no_of_sentences = st.text_input("Enter no of sentences to be summarised in (for extractive mode)", placeholder="Default is 2")
            submit_button = st.form_submit_button("Submit")
            if submit_button:
                requested = no_of_sentences.strip()
                if not text.strip():
                    st.warning("Please enter some text to summarise.")
                elif requested and (not requested.isdecimal() or int(requested) < 1):
                    # Reject input that int() cannot parse or that asks for
                    # no sentences, instead of surfacing a traceback.
                    st.error("Number of sentences must be a positive whole number.")
                else:
                    # An empty string makes generate_summary use its default.
                    st.success(generate_summary(text, requested))
    else:
        with st.form("my_form2"):
            st.write("Abstractive Summary")
            model = st.selectbox('Model for abstractive Summary', ["Bart", "Pegasus"])
            min_length = st.slider('minimum length of summary(Words)', 5, 70, 30, 1)
            max_length = st.slider('maximum length of summary(Words)', 70, 150, 128, 1)
            submit_button = st.form_submit_button("Submit")
            if submit_button:
                if text.strip():
                    run_model(model, text, min_length, max_length)
                else:
                    st.warning("Please enter some text to summarise.")
|
214 |
+
|
215 |
+
|
216 |
+
|
217 |
+
|
218 |
+
|
219 |
+
# Start the app only when this file is executed as a script, not when it is imported.
if __name__== '__main__':
    main()
|