import nltk
from sumy.parsers import DocumentParser
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from transformers import Pipeline


class Summarizer:
    DEFAULT_LANGUAGE = "english"
    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 15

    def __init__(self, pipeline: Pipeline):
        self.pipeline = pipeline
        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
        self.lsa_summarizer = LsaSummarizer(stemmer)
        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)

    @staticmethod
    def sentence_list(summarized_sentences) -> list:
        # sumy Sentence objects keep their raw text in the private _text attribute
        summarized_list = []
        for sentence in summarized_sentences:
            summarized_list.append(sentence._text)
        return summarized_list

    @staticmethod
    def join_sentences(summary_sentences: list) -> str:
        return " ".join(summary_sentences)

    @staticmethod
    def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
        # Greedily pack sentences into chunks whose nltk token count does not exceed
        # split_token_length; a single sentence longer than the limit becomes its own chunk.
        accumulated_lists = []
        result_list = []
        cumulative_token_length = 0
        for sentence in summary_sentences:
            token_list = [token for token in nltk.word_tokenize(sentence) if token not in ['.']]
            token_length = len(token_list)
            if token_length + cumulative_token_length > split_token_length and result_list:
                accumulated_lists.append(Summarizer.join_sentences(result_list))
                result_list = [sentence]
                cumulative_token_length = token_length
            else:
                result_list.append(sentence)
                cumulative_token_length += token_length
        if result_list:
            accumulated_lists.append(Summarizer.join_sentences(result_list))
        return accumulated_lists

    def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
        summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
        summarized_list = Summarizer.sentence_list(summarized_sentences)
        return summarized_list

    def extractive_summary_from_text(self, text: str, sentences_count: int) -> list:
        parser = PlaintextParser.from_string(text, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)

    def extractive_summary_from_url(self, url: str, sentences_count: int) -> list:
        parser = HtmlParser.from_url(url, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)
    def abstractive_summary(self, extract_summary_sentences: list) -> list:
        """
        :param extract_summary_sentences: Extractive summary sentences produced by latent semantic analysis
        :return: List of abstractive summary sentences produced by the ml6team/distilbart-tos-summarizer-tosdr pipeline
        """
        wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
                                                                       split_token_length=600)
        # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
        abstractive_summary_list = []
        for result in self.pipeline(wrapped_sentences, min_length=32, max_length=512):
            abstractive_summary_list.append(result['summary_text'])
        return abstractive_summary_list
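

# Minimal usage sketch (not part of the original file). It assumes the
# transformers summarization pipeline is built from the
# ml6team/distilbart-tos-summarizer-tosdr checkpoint referenced above, that
# nltk's "punkt" tokenizer data is installed for nltk.word_tokenize, and that
# the URL below is only a placeholder.
if __name__ == "__main__":
    from transformers import pipeline

    summarization_pipeline = pipeline(
        "summarization",
        model="ml6team/distilbart-tos-summarizer-tosdr",
    )
    summarizer = Summarizer(summarization_pipeline)

    extractive_sentences = summarizer.extractive_summary_from_url(
        "https://example.com/terms-of-service",  # placeholder URL
        sentences_count=Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH,
    )
    abstractive_sentences = summarizer.abstractive_summary(extractive_sentences)
    print(Summarizer.join_sentences(abstractive_sentences))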