Summarizer / app.py
Pratick's picture
Update app.py
df24c7f
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en import English
import numpy as np
import gradio as gr
# Blank English pipeline used only for sentence splitting.
nlp = English()
# create_pipe() merely builds a component without registering it, which
# leaves doc.sents unusable; add_pipe() actually installs the rule-based
# sentencizer into the pipeline (spaCy v3 string-name API).
nlp.add_pipe("sentencizer")
def summarizer(text, tokenizer, max_sent_in_summary=5):
    """Return an extractive summary made of the top TF-IDF scored sentences.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``. Each sentence is treated as one document for a TF-IDF
    (Term Frequency - Inverse Document Frequency) model, and a sentence's
    score is the sum of its TF-IDF weights. The highest scoring sentences
    are returned in the order they appear in the input.

    Args:
        text: Raw input text to summarise.
        tokenizer: Unused. Kept so existing callers that pass two
            positional arguments keep working.
        max_sent_in_summary: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces. An empty string is
        returned for empty/whitespace-only text or a non-positive
        ``max_sent_in_summary``. If the text has no more sentences than
        ``max_sent_in_summary``, all of them are returned unchanged.
    """
    if not text or not text.strip() or max_sent_in_summary <= 0:
        return ""

    # Replace newlines with a space rather than nothing; otherwise the words
    # on either side of a line break are glued together ("end.\nNext" ->
    # "end.Next") and sentence splitting breaks.
    doc = nlp(text.replace("\n", " "))
    sentences = [
        stripped
        for stripped in (sent.text.strip() for sent in doc.sents)
        if stripped
    ]

    # Nothing to rank: every sentence fits in the summary. This also avoids
    # fitting TF-IDF on very short inputs, where min_df=2 typically leaves
    # an empty vocabulary and raises ValueError.
    if len(sentences) <= max_sent_in_summary:
        return " ".join(sentences)

    # Boolean options are passed as real bools; recent scikit-learn versions
    # validate parameter types and reject ints such as 1.
    tf_idf_vectorizer = TfidfVectorizer(
        min_df=2,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words='english',
    )

    try:
        # Each sentence is one document; fit and vectorise in a single pass.
        sentence_vectors = tf_idf_vectorizer.fit_transform(sentences)
    except ValueError:
        # No term survives stop-word removal and min_df pruning, so there is
        # nothing to score on; fall back to the leading sentences.
        return " ".join(sentences[:max_sent_in_summary])

    # Score of a sentence = sum of the TF-IDF weights of its terms.
    sentence_scores = np.asarray(sentence_vectors.sum(axis=1)).ravel()

    # Indices of the highest scoring sentences.
    top_indices = np.argsort(sentence_scores, axis=0)[::-1][:max_sent_in_summary]

    # Restore document order by sorting the indices themselves. Working with
    # indices (instead of a dict keyed by sentence text) keeps repeated
    # sentences at their own positions.
    return " ".join(sentences[index] for index in sorted(top_indices))
# Gradio UI: two free-text inputs are passed positionally to summarizer
# (as `text` and `tokenizer`); its return value is rendered as text.
demo = gr.Interface(fn=summarizer, inputs=["text", "text"], outputs="text")

# Start serving the interface with debug enabled.
demo.launch(debug=True)