# Spaces:
# Sleeping
# Sleeping
#import nltk
#nltk.download('punkt')
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
def get_Summary(in_text):
    """Return a short (one-sentence) LexRank summary of *in_text*.

    Parameters
    ----------
    in_text : str
        The plain text to summarize.

    Returns
    -------
    str
        The summary items joined by '. ', with full stops inside each
        item removed; or the tuple ('Error: No sentences available', None)
        when the input contains no sentences.
    """
    # Split on sentence boundaries and drop empty fragments so that an
    # empty or whitespace-only input is recognized as "no sentences"
    # (previously the zero-check below was dead code because the count
    # was hard-coded to 1).
    sentences = [s for s in in_text.split('. ') if s.strip()]
    # Summarize only a small part of the text: at most one sentence.
    nr_sentences = min(1, len(sentences))
    print('nr_sentences: ' + str(nr_sentences))
    if nr_sentences == 0:
        return 'Error: No sentences available', None
    list_summary = get_Lexrank(in_text, nr_sentences)
    # A LexRank "sentence" may itself consist of multiple actual sentences
    # separated by full stops; then the corresponding timestamp cannot be
    # found. Therefore all items from the LexRank summary are stripped of
    # their internal full stops and concatenated, separated by '. '.
    concat_list_summary = '. '.join(str(item).replace('.', '') for item in list_summary)
    return concat_list_summary
def get_Lexrank(text, nr_sentences):
    """Summarize *text* with the LexRank algorithm.

    Parameters
    ----------
    text : str
        Plain text to be summarized.
    nr_sentences : int
        Number of sentences the summary should contain.

    Returns
    -------
    list
        The sentence objects selected by the sumy LexRank summarizer.
    """
    language = "english"
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    lex_rank = LexRankSummarizer(Stemmer(language))
    lex_rank.stop_words = get_stop_words(language)
    return list(lex_rank(document, nr_sentences))