Spaces:

ml6team
/

distilbart-tos-summarizer-tosdr

Build error

App Files Files Community

distilbart-tos-summarizer-tosdr / Summarizer.py

sdhanabal1

Tune length parameters so that the token count doesn't exceed 1024, which is the model limit

4f3c9ea almost 3 years ago

raw

history blame

3.63 kB

	import string

	import nltk

	from sumy.parsers import DocumentParser
	from sumy.parsers.html import HtmlParser
	from sumy.parsers.plaintext import PlaintextParser
	from sumy.nlp.tokenizers import Tokenizer
	from sumy.nlp.stemmers import Stemmer
	from sumy.summarizers.lsa import LsaSummarizer
	from sumy.utils import get_stop_words
	from transformers import Pipeline


	class Summarizer:
	    """
	    Two-stage text summarizer.

	    An extractive pass (sumy's ``LsaSummarizer``) first selects sentences from a
	    document; the selected sentences are then grouped into chunks and handed to a
	    summarization pipeline for the abstractive pass.
	    """

	    DEFAULT_LANGUAGE = "english"
	    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 15

	    # Word-token budget for each chunk sent to the abstractive model. The
	    # ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024
	    # tokens per input; nltk word counts only approximate the model's own token
	    # count, so the budget is kept well below that limit.
	    ABSTRACTIVE_CHUNK_TOKEN_LENGTH = 600
	    # Generation length bounds forwarded to the pipeline.
	    ABSTRACTIVE_MIN_LENGTH = 32
	    ABSTRACTIVE_MAX_LENGTH = 512

	    def __init__(self, pipeline: "Pipeline"):
	        """
	        :param pipeline: Summarization pipeline used for the abstractive pass; it is called
	            with a list of texts and must yield dicts carrying a ``summary_text`` key
	        """
	        self.pipeline = pipeline
	        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
	        self.lsa_summarizer = LsaSummarizer(stemmer)
	        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)

	    @staticmethod
	    def sentence_list(summarized_sentences) -> list:
	        """
	        :param summarized_sentences: Iterable of sentence objects produced by the extractive pass
	        :return: List with the ``_text`` attribute of every sentence, in the same order
	        """
	        # NOTE(review): this reads the private ``_text`` attribute of each sentence
	        # object; ``str(sentence)`` is presumably the public equivalent - confirm
	        # against sumy before switching.
	        return [sentence._text for sentence in summarized_sentences]

	    @staticmethod
	    def join_sentences(summary_sentences: list) -> str:
	        """
	        :param summary_sentences: Sentences to concatenate
	        :return: The sentences joined by single spaces (empty string for no sentences)
	        """
	        return " ".join(summary_sentences)

	    @staticmethod
	    def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
	        """
	        Greedily pack consecutive sentences into chunks of bounded word-token count.

	        A sentence is added to the current chunk unless doing so would push the
	        chunk's token count above ``split_token_length``; in that case the current
	        chunk is closed and the sentence starts a new one. A single sentence that
	        is longer than the budget on its own is never split, so it still forms an
	        (oversized) chunk by itself.

	        :param summary_sentences: Sentences to pack, in document order
	        :param split_token_length: Maximum number of nltk word tokens per chunk ('.' is not counted)
	        :return: List of chunks, each chunk being its sentences joined by spaces
	        """
	        chunks = []
	        current_sentences = []
	        current_token_count = 0
	        for sentence in summary_sentences:
	            # Sentence-final periods are excluded from the count.
	            token_count = sum(1 for token in nltk.word_tokenize(sentence) if token != ".")
	            if current_sentences and current_token_count + token_count > split_token_length:
	                # Budget exceeded: close the current chunk and start a new one.
	                chunks.append(Summarizer.join_sentences(current_sentences))
	                current_sentences = [sentence]
	                current_token_count = token_count
	            else:
	                current_sentences.append(sentence)
	                current_token_count += token_count
	        # Flush whatever is left in the last, partially filled chunk.
	        if current_sentences:
	            chunks.append(Summarizer.join_sentences(current_sentences))
	        return chunks

	    def __extractive_summary(self, parser: "DocumentParser", sentences_count) -> list:
	        """
	        :param parser: Parser whose ``document`` is fed to the LSA summarizer
	        :param sentences_count: Number of sentences requested from the LSA summarizer
	        :return: List of the extracted sentences' text
	        """
	        summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
	        return Summarizer.sentence_list(summarized_sentences)

	    def extractive_summary_from_text(self, text: str, sentences_count: int) -> list:
	        """
	        :param text: Plain text to summarize
	        :param sentences_count: Number of sentences requested from the LSA summarizer
	        :return: List of the extracted sentences' text
	        """
	        parser = PlaintextParser.from_string(text, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
	        return self.__extractive_summary(parser, sentences_count)

	    def extractive_summary_from_url(self, url: str, sentences_count: int) -> list:
	        """
	        :param url: Address of the HTML page to summarize
	        :param sentences_count: Number of sentences requested from the LSA summarizer
	        :return: List of the extracted sentences' text
	        """
	        parser = HtmlParser.from_url(url, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
	        return self.__extractive_summary(parser, sentences_count)

	    def abstractive_summary(self, extract_summary_sentences: list) -> list:
	        """
	        :param extract_summary_sentences: Extractive summary of sentences after Latent semantic analysis
	        :return: List of abstractive summaries, one per chunk of sentences sent to the
	            distilbart-tos-summarizer-tosdr pipeline; empty list when there is nothing to summarize
	        """
	        # Nothing to summarize: do not invoke the model on an empty batch.
	        if not extract_summary_sentences:
	            return []
	        wrapped_sentences = Summarizer.split_sentences_by_token_length(
	            extract_summary_sentences,
	            split_token_length=Summarizer.ABSTRACTIVE_CHUNK_TOKEN_LENGTH,
	        )
	        # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024
	        # tokens per input. The word-based chunking above only approximates that limit
	        # (and cannot split a single very long sentence), so ask the pipeline to
	        # truncate any chunk that is still too long for the model.
	        results = self.pipeline(
	            wrapped_sentences,
	            min_length=Summarizer.ABSTRACTIVE_MIN_LENGTH,
	            max_length=Summarizer.ABSTRACTIVE_MAX_LENGTH,
	            truncation=True,
	        )
	        return [result['summary_text'] for result in results]