# Uncomment on first run: sumy's Tokenizer relies on NLTK's punkt data.
#import nltk
#nltk.download('punkt')
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


def get_Summary(in_text):
    # Summarize a small part of the text: fixed at one sentence for now
    # (switch to len(sentences) to scale the summary with the input length).
    sentences = in_text.split('. ')
    nr_sentences = 1  # len(sentences)
    print('nr_sentences: ' + str(nr_sentences))
    if nr_sentences == 0:
        return 'Error: No sentences available'
    list_summary = get_Lexrank(in_text, nr_sentences)
    # A LexRank "sentence" can consist of multiple actual sentences separated
    # by full stops, in which case the corresponding timestamp cannot be found.
    # Therefore all items of the LexRank summary are stripped of full stops
    # and concatenated, so the result can later be split on '. ' again.
    concat_list_summary = '. '.join(
        [str(item).replace('.', '') for item in list_summary])
    return concat_list_summary


def get_Lexrank(text, nr_sentences):
    LANGUAGE = "english"
    summary = []
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, nr_sentences):
        summary.append(sentence)
    return summary
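
# Example usage: a minimal sketch; the sample text below is an illustrative
# assumption, not part of the original module.
if __name__ == '__main__':
    sample = ('LexRank is a graph-based summarizer. '
              'It scores sentences by centrality in a similarity graph. '
              'The highest-scoring sentences form the extractive summary.')
    # With nr_sentences fixed at 1, this prints the single most central
    # sentence, with any internal full stops stripped by get_Summary.
    print(get_Summary(sample))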