from textwrap import wrap
from typing import List, Tuple

from sumy.parsers import DocumentParser
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from transformers import Pipeline


class Summarizer:
    DEFAULT_LANGUAGE = "english"
    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10

    def __init__(self, pipeline: Pipeline):
        # Abstractive summarization is delegated to a Hugging Face pipeline;
        # extractive summarization uses sumy's LSA summarizer with a
        # language-appropriate stemmer and stop-word list.
        self.pipeline = pipeline
        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
        self.lsa_summarizer = LsaSummarizer(stemmer)
        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)

    @staticmethod
    def sentence_list(summarized_sentences) -> List[str]:
        # Convert sumy Sentence objects to plain strings. str() is the public
        # interface; the original reached into the private `_text` attribute.
        return [str(sentence) for sentence in summarized_sentences]

    def __extractive_summary(self, parser: DocumentParser, sentences_count: int) -> Tuple[List[str], List[str]]:
        # LSA selects the `sentences_count` most salient sentences from the
        # parsed document. Return both the full sentence list and the
        # extracted subset so callers can show them side by side.
        summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
        summarized_list = Summarizer.sentence_list(summarized_sentences)
        all_sentences_list = Summarizer.sentence_list(parser.document.sentences)
        return all_sentences_list, summarized_list

    def extractive_summary_from_text(self, text: str, sentences_count: int) -> Tuple[List[str], List[str]]:
        parser = PlaintextParser.from_string(text, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)

    def extractive_summary_from_url(self, url: str, sentences_count: int) -> Tuple[List[str], List[str]]:
        parser = HtmlParser.from_url(url, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)

    def abstractive_summary(self, summary: str) -> str:
        # Split the input into chunks of at most 2048 characters so each fits
        # within the model's input limit, summarize each chunk, and join the
        # partial summaries into one string.
        return " ".join(
            result["summary_text"] for result in self.pipeline(wrap(summary, 2048))
        )
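

# Usage sketch (an illustration, not part of the original module): wiring the
# class to a Hugging Face summarization pipeline. The checkpoint name below is
# an assumption; any seq2seq summarization model would work the same way.
if __name__ == "__main__":
    from transformers import pipeline

    # Hypothetical model choice; substitute whichever checkpoint the project
    # actually uses.
    hf_pipeline = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    summarizer = Summarizer(hf_pipeline)

    # Extractive pass: pick the N most salient sentences from the raw text.
    all_sentences, extracted = summarizer.extractive_summary_from_text(
        "Some long article text. " * 50,
        Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH,
    )

    # Abstractive pass: rewrite the extracted sentences into fluent prose.
    print(summarizer.abstractive_summary(" ".join(extracted)))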