sdhanabal1's picture
Tune pipeline parameters
338f4fe
raw
history blame
2.18 kB
from textwrap import wrap
from sumy.parsers import DocumentParser
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from transformers import Pipeline
class Summarizer:
    """Two-stage text summarizer: extractive (LSA) followed by abstractive.

    The extractive stage uses sumy's ``LsaSummarizer`` configured with the
    stemmer and stop words for ``DEFAULT_LANGUAGE``. The abstractive stage
    delegates to the callable ``pipeline`` supplied at construction time.
    """

    DEFAULT_LANGUAGE = "english"
    # Not referenced inside this class; presumably a default sentence count
    # for callers of the extractive methods -- confirm against call sites.
    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10

    # Maximum width, in characters, of each chunk handed to the pipeline.
    ABSTRACTIVE_CHUNK_WIDTH = 2048
    # Forwarded to the pipeline as ``min_length`` / ``max_length``.
    ABSTRACTIVE_MIN_LENGTH = 5
    ABSTRACTIVE_MAX_LENGTH = 512

    def __init__(self, pipeline: Pipeline):
        """Store the abstractive pipeline and set up the LSA summarizer.

        Args:
            pipeline: Callable invoked as
                ``pipeline(list_of_texts, min_length=..., max_length=...)``;
                every result it yields must expose a ``'summary_text'`` key.
        """
        self.pipeline = pipeline
        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
        self.lsa_summarizer = LsaSummarizer(stemmer)
        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)

    @staticmethod
    def sentence_list(summarized_sentences) -> list:
        """Return the text of every sentence in ``summarized_sentences``.

        Args:
            summarized_sentences: Iterable of sentence objects (as returned
                by the LSA summarizer) or plain strings.

        Returns:
            A list with one string per input item, in the same order.
        """
        # str() is the public conversion for sumy sentences; it avoids
        # reaching into the private ``_text`` attribute.
        return [str(sentence) for sentence in summarized_sentences]

    def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
        """Run the LSA summarizer over ``parser.document``.

        Args:
            parser: Parser whose ``document`` attribute holds the parsed text.
            sentences_count: Sentence count passed to the LSA summarizer.

        Returns:
            The selected sentences as a list of strings.
        """
        summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
        return Summarizer.sentence_list(summarized_sentences)

    def extractive_summary_from_text(self, text: str, sentences_count: int) -> list:
        """Extractively summarize plain ``text``.

        Args:
            text: Raw text to summarize.
            sentences_count: Sentence count passed to the LSA summarizer.

        Returns:
            The selected sentences as a list of strings.
        """
        parser = PlaintextParser.from_string(text, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)

    def extractive_summary_from_url(self, url: str, sentences_count: int) -> list:
        """Extractively summarize the HTML document located at ``url``.

        Args:
            url: Address handed to ``HtmlParser.from_url``.
            sentences_count: Sentence count passed to the LSA summarizer.

        Returns:
            The selected sentences as a list of strings.
        """
        parser = HtmlParser.from_url(url, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)

    def abstractive_summary(self, extract_summary_sentences: list) -> list:
        """Summarize extracted sentences with the abstractive pipeline.

        The sentences are joined with single spaces and re-wrapped into
        chunks of at most ``ABSTRACTIVE_CHUNK_WIDTH`` characters; all chunks
        are passed to the pipeline in one call.

        Args:
            extract_summary_sentences: List of sentence strings.

        Returns:
            The ``'summary_text'`` of every result the pipeline yields, in
            order. An empty list when there is no text to summarize.
        """
        extract_summary = " ".join(extract_summary_sentences)
        wrapped_sentences = wrap(extract_summary, Summarizer.ABSTRACTIVE_CHUNK_WIDTH)
        if not wrapped_sentences:
            # Empty or whitespace-only input: skip the pipeline rather than
            # calling it with an empty batch.
            return []
        results = self.pipeline(
            wrapped_sentences,
            min_length=Summarizer.ABSTRACTIVE_MIN_LENGTH,
            max_length=Summarizer.ABSTRACTIVE_MAX_LENGTH,
        )
        return [result['summary_text'] for result in results]